diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -656,10 +656,17 @@ /// Return true if the target supports nontemporal load. bool isLegalNTLoad(Type *DataType, Align Alignment) const; - /// Return true if the target supports masked scatter. - bool isLegalMaskedScatter(Type *DataType, Align Alignment) const; + /// Return true if masked gather should be used for vectorization. + bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask, + Align Alignment) const; + /// Return true if masked scatter should be used for vectorization. + bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask, + Align Alignment) const; + /// Return true if the target supports masked gather. bool isLegalMaskedGather(Type *DataType, Align Alignment) const; + /// Return true if the target supports masked scatter. + bool isLegalMaskedScatter(Type *DataType, Align Alignment) const; /// Return true if the target supports masked compress store. 
bool isLegalMaskedCompressStore(Type *DataType) const; @@ -1513,8 +1520,14 @@ virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0; - virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0; + virtual bool shouldUseMaskedGatherForVectorization(Type *DataType, + bool VariableMask, + Align Alignment) = 0; + virtual bool shouldUseMaskedScatterForVectorization(Type *DataType, + bool VariableMask, + Align Alignment) = 0; virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0; + virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0; virtual bool isLegalMaskedCompressStore(Type *DataType) = 0; virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0; virtual bool enableOrderedReductions() = 0; @@ -1893,12 +1906,22 @@ bool isLegalNTLoad(Type *DataType, Align Alignment) override { return Impl.isLegalNTLoad(DataType, Alignment); } - bool isLegalMaskedScatter(Type *DataType, Align Alignment) override { - return Impl.isLegalMaskedScatter(DataType, Alignment); + bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask, + Align Alignment) override { + return Impl.shouldUseMaskedGatherForVectorization(DataType, VariableMask, + Alignment); + } + bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask, + Align Alignment) override { + return Impl.shouldUseMaskedScatterForVectorization(DataType, VariableMask, + Alignment); } bool isLegalMaskedGather(Type *DataType, Align Alignment) override { return Impl.isLegalMaskedGather(DataType, Alignment); } + bool isLegalMaskedScatter(Type *DataType, Align Alignment) override { + return Impl.isLegalMaskedScatter(DataType, Alignment); + } bool isLegalMaskedCompressStore(Type *DataType) override { return Impl.isLegalMaskedCompressStore(DataType); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h 
b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -862,6 +862,16 @@ public: using BaseT::getGEPCost; + bool shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask, + Align Alignment) { + return static_cast<T *>(this)->isLegalMaskedGather(DataType, Alignment); + } + + bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask, + Align Alignment) { + return static_cast<T *>(this)->isLegalMaskedScatter(DataType, Alignment); + } + InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef<const Value *> Operands, TTI::TargetCostKind CostKind) { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -393,6 +393,18 @@ return TTIImpl->isLegalNTLoad(DataType, Alignment); } +bool TargetTransformInfo::shouldUseMaskedGatherForVectorization( + Type *DataType, bool VariableMask, Align Alignment) const { + return TTIImpl->shouldUseMaskedGatherForVectorization(DataType, VariableMask, + Alignment); +} + +bool TargetTransformInfo::shouldUseMaskedScatterForVectorization( + Type *DataType, bool VariableMask, Align Alignment) const { + return TTIImpl->shouldUseMaskedScatterForVectorization(DataType, VariableMask, + Alignment); +} + bool TargetTransformInfo::isLegalMaskedGather(Type *DataType, Align Alignment) const { return TTIImpl->isLegalMaskedGather(DataType, Alignment); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -227,6 +227,10 @@ bool isLegalMaskedStore(Type *DataType, Align Alignment); bool isLegalNTLoad(Type *DataType, Align Alignment); bool isLegalNTStore(Type *DataType, Align Alignment); + bool 
shouldUseMaskedGatherForVectorization(Type *DataType, bool VariableMask, + Align Alignment); + bool shouldUseMaskedScatterForVectorization(Type *DataType, bool VariableMask, + Align Alignment); bool isLegalMaskedGather(Type *DataType, Align Alignment); bool isLegalMaskedScatter(Type *DataType, Align Alignment); bool isLegalMaskedExpandLoad(Type *DataType); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4871,6 +4871,21 @@ return isLegalMaskedExpandLoad(DataTy); } +bool X86TTIImpl::shouldUseMaskedGatherForVectorization(Type *DataType, + bool VariableMask, + Align Alignment) { + if (!VariableMask) + return true; + return isLegalMaskedGather(DataType, Alignment); +} +bool X86TTIImpl::shouldUseMaskedScatterForVectorization(Type *DataType, + bool VariableMask, + Align Alignment) { + if (!VariableMask) + return true; + return isLegalMaskedScatter(DataType, Alignment); +} + bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { // Some CPUs have better gather performance than others. // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1504,15 +1504,18 @@ /// Returns true if the target machine can represent \p V as a masked gather /// or scatter operation. 
- bool isLegalGatherOrScatter(Value *V) { + bool shouldUseGatherOrScatterForVectorization(Value *V) { + auto *I = cast<Instruction>(V); bool LI = isa<LoadInst>(V); bool SI = isa<StoreInst>(V); if (!LI && !SI) return false; auto *Ty = getLoadStoreType(V); Align Align = getLoadStoreAlignment(V); - return (LI && TTI.isLegalMaskedGather(Ty, Align)) || - (SI && TTI.isLegalMaskedScatter(Ty, Align)); + return (LI && TTI.shouldUseMaskedGatherForVectorization( + Ty, Legal->isMaskRequired(I), Align)) || + (SI && TTI.shouldUseMaskedScatterForVectorization( + Ty, Legal->isMaskRequired(I), Align)); } /// Returns true if the target machine supports all of the reduction @@ -6342,7 +6345,8 @@ // optimization to non-pointer types. // if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && - !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) + !isAccessInterleaved(&I) && + !shouldUseGatherOrScatterForVectorization(&I)) continue; ElementTypesInLoop.insert(T); @@ -7454,7 +7458,7 @@ // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract InstructionCost Cost; if (isa<StoreInst>(&I) && VF.isScalable() && - isLegalGatherOrScatter(&I)) { + shouldUseGatherOrScatterForVectorization(&I)) { Cost = getGatherScatterCost(&I, VF); setWideningDecision(&I, VF, CM_GatherScatter, Cost); } else { @@ -7496,7 +7500,7 @@ } InstructionCost GatherScatterCost = - isLegalGatherOrScatter(&I) + shouldUseGatherOrScatterForVectorization(&I) ? 
getGatherScatterCost(&I, VF) * NumAccesses : InstructionCost::getInvalid(); diff --git a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll @@ -17,45 +17,45 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE2: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE2: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE2: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE42: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, 
align 2 +; SSE42: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE42: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; SSE42: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX1: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found 
an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an 
estimated cost of 96 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 222 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 444 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 -; AVX512: LV: Found an estimated cost of 888 for VF 64 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 112 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2 +; AVX512: LV: Found an estimated cost of 224 for VF 64 For instruction: %valB = load i16, i16* %inB, align 2 ; ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %valB = load i16, i16* %inB, align 4 define void @test() { diff --git a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll @@ -17,30 +17,30 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found 
an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE2: LV: Found an estimated cost of 16 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE2: LV: Found an estimated cost of 32 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE2: LV: Found an estimated cost of 64 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 -; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE42: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE42: LV: Found an estimated cost of 16 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE42: LV: Found an estimated cost of 32 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 +; SSE42: LV: Found an estimated cost of 64 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* 
%inB, align 4 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 440 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i32, 
i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4 diff --git a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll @@ -17,30 +17,30 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE2: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE42: LV: Found an estimated cost of 58 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 -; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE42: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE42: LV: Found an estimated cost of 12 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE42: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 +; SSE42: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 12 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB 
= load i64, i64* %inB, align 8 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 12 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 24 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 48 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8 diff --git a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll @@ -17,45 +17,45 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE2: LV: 
Found an estimated cost of 29 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE2: LV: Found an estimated cost of 72 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 ; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE42: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE42: LV: Found an estimated cost of 24 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; SSE42: LV: Found an estimated cost of 72 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 ; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an 
estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX1: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX1: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 ; ; 
AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 442 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 -; AVX512: LV: Found an estimated cost of 884 for VF 64 For instruction: %valB = load i8, 
i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 16 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 32 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 96 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1 +; AVX512: LV: Found an estimated cost of 224 for VF 64 For instruction: %valB = load i8, i8* %inB, align 1 ; ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %valB = load i8, i8* %inB, align 4 define void @test() { diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll @@ -17,45 +17,45 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; SSE2: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; SSE2: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; SSE2: LV: Found 
an estimated cost of 32 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; SSE42: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; SSE42: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; SSE42: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; SSE42: LV: Found an estimated cost of 32 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: 
LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; AVX1: LV: Found an estimated cost of 80 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 
224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 228 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 464 for VF 32 For instruction: store i16 %valB, i16* %out, align 2 -; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 16 for VF 8 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 88 for VF 
32 For instruction: store i16 %valB, i16* %out, align 2 +; AVX512: LV: Found an estimated cost of 176 for VF 64 For instruction: store i16 %valB, i16* %out, align 2 ; ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %valB, i16* %out define void @test() { diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll @@ -17,37 +17,37 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 -; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 -; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; SSE2: LV: Found an estimated cost of 11 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; SSE2: LV: Found an estimated cost of 22 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 +; SSE2: LV: Found an estimated cost of 44 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 -; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction: store i32 %valB, i32* 
%out, align 4 -; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; SSE42: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; SSE42: LV: Found an estimated cost of 11 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; SSE42: LV: Found an estimated cost of 22 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 +; SSE42: LV: Found an estimated cost of 44 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 20 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 40 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; AVX1: LV: Found an estimated cost of 80 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 
4 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 20 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 -; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 20 for VF 8 For instruction: 
store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i32 %valB, i32* %out, align 4 +; AVX2-FASTGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i32 %valB, i32* %out, align 4 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, i32* %out diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll @@ -17,37 +17,37 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; SSE2: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; SSE2: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; SSE2: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; SSE42: LV: Found an estimated cost of 
58 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; SSE42: LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; SSE42: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; SSE42: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; SSE42: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; AVX1: LV: Found an estimated cost of 80 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; 
AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 -; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: 
LV: Found an estimated cost of 10 for VF 4 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: LV: Found an estimated cost of 20 for VF 8 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: LV: Found an estimated cost of 40 for VF 16 For instruction: store i64 %valB, i64* %out, align 8 +; AVX2-FASTGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i64 %valB, i64* %out, align 8 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, i64* %out, align 8 diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll --- a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll @@ -17,45 +17,45 @@ ; CHECK: LV: Checking a loop in "test" ; ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; SSE2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; SSE2: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; SSE2: LV: Found an estimated cost of 23 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; SSE2: LV: Found an estimated cost of 47 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 ; ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 
%valB, i8* %out, align 1 -; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; SSE42: LV: Found an estimated cost of 5 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; SSE42: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; SSE42: LV: Found an estimated cost of 23 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; SSE42: LV: Found an estimated cost of 47 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 ; ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 216 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 -; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; AVX1: LV: Found an estimated cost of 80 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 ; 
; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 
for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 80 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 456 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 -; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 32 for VF 16 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: store i8 %valB, i8* %out, align 1 +; AVX512: LV: Found an estimated cost of 176 for VF 64 For instruction: store i8 %valB, i8* %out, align 1 ; ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i8 %valB, 
i8* %out define void @test() { diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx -enable-interleaved-mem-accesses=false < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -17,6 +18,144 @@ ; CHECK-NOT: x float> define void @_Z4testmm(i64 %size, i64 %offset) { +; CHECK-LABEL: @_Z4testmm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP53:%.*]] = icmp eq i64 [[SIZE:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ 
[[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, <4 x i64> [[TMP5]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP6]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[TMP10]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP17]], 
align 4 +; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <4 x float> [[TMP14]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* [[TMP21]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = fmul fast <4 x float> [[TMP18]], [[WIDE_LOAD5]] +; CHECK-NEXT: [[TMP23]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = add <4 x i64> [[TMP5]], +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, <4 x i64> [[TMP24]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP25]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER6]] +; CHECK-NEXT: [[TMP27:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP28]] +; CHECK-NEXT: [[TMP30]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = add <4 x i64> [[TMP5]], +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, <4 x i64> [[TMP31]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP32]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER7]] +; CHECK-NEXT: [[TMP34:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP35]] 
+; CHECK-NEXT: [[TMP37]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP36]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP39:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP30]]) +; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP23]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP40]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[R_057:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[G_056:%.*]] = phi float [ [[BC_MERGE_RDX8]], [[SCALAR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[V_055:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[B_054:%.*]] = phi float [ [[BC_MERGE_RDX9]], [[SCALAR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[V_055]], [[OFFSET]] +; CHECK-NEXT: [[MUL:%.*]] = mul i64 
[[ADD]], 3 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[MUL]] +; CHECK-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP43:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP42]], [[TMP43]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP44]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP45:%.*]] = load float, float* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP45]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP46]] +; CHECK-NEXT: [[ADD10]] = fadd fast float [[R_057]], [[MUL9]] +; CHECK-NEXT: [[ARRAYIDX_SUM:%.*]] = add i64 [[MUL]], 1 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM]] +; CHECK-NEXT: [[TMP47:%.*]] = load float, float* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP43]], [[TMP47]] +; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP44]], [[MUL13]] +; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP45]], [[MUL15]] +; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP46]], [[MUL17]] +; CHECK-NEXT: [[ADD20]] = fadd fast float [[G_056]], [[MUL19]] +; CHECK-NEXT: [[ARRAYIDX_SUM52:%.*]] = add i64 [[MUL]], 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = 
getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM52]] +; CHECK-NEXT: [[TMP48:%.*]] = load float, float* [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP43]], [[TMP48]] +; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP44]], [[MUL23]] +; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP45]], [[MUL25]] +; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP46]], [[MUL27]] +; CHECK-NEXT: [[ADD30]] = fadd fast float [[B_054]], [[MUL29]] +; CHECK-NEXT: [[INC]] = add i64 [[V_055]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INC]], [[SIZE]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ], [ [[TMP40]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8 +; CHECK-NEXT: [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8 +; CHECK-NEXT: [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8 +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: store i8 [[R_0_LCSSA]], i8* @r_, align 1 +; CHECK-NEXT: store i8 [[G_0_LCSSA]], i8* @g_, align 1 +; CHECK-NEXT: store i8 [[B_0_LCSSA]], i8* @b_, align 1 +; CHECK-NEXT: ret void +; entry: %cmp53 = icmp eq i64 %size, 0 br i1 %cmp53, label %for.end, label %for.body.lr.ph diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll --- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -2,7 +2,7 @@ ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=SSE ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=sandybridge < %s | FileCheck %s --check-prefix=AVX1 ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=haswell < %s | FileCheck %s --check-prefix=AVX2 -; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SSE +; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=SLM ; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=SSE define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) { @@ -33,19 +33,54 @@ ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1 -; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 -; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] -; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* -; AVX1-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* 
[[TMP5]], align 4 -; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; AVX1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; AVX1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX1-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX1-NEXT: [[TMP0:%.*]] = shl nsw <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[TMP1:%.*]] = add <2 x i64> [[STEP_ADD]], +; AVX1-NEXT: [[STEP_ADD1:%.*]] = shl <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[TMP2:%.*]] = add <2 x i64> [[STEP_ADD1]], +; AVX1-NEXT: [[STEP_ADD2:%.*]] = shl <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STEP_ADD2]], +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <2 x i64> [[TMP0]] +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP1]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP2]] +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP3]] +; AVX1-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP4]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP5]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP6]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP7]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP0]], +; AVX1-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP1]], +; AVX1-NEXT: [[TMP10:%.*]] = or <2 x i64> [[TMP2]], +; AVX1-NEXT: [[TMP11:%.*]] = or <2 x i64> [[TMP3]], +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* 
[[B]], <2 x i64> [[TMP8]] +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP9]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP10]] +; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], <2 x i64> [[TMP11]] +; AVX1-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP12]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP13]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP14]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP15]], i32 4, <2 x i1> , <2 x i32> undef) +; AVX1-NEXT: [[TMP16:%.*]] = add nsw <2 x i32> [[WIDE_MASKED_GATHER7]], [[WIDE_MASKED_GATHER]] +; AVX1-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_MASKED_GATHER8]], [[WIDE_MASKED_GATHER4]] +; AVX1-NEXT: [[TMP18:%.*]] = add nsw <2 x i32> [[WIDE_MASKED_GATHER9]], [[WIDE_MASKED_GATHER5]] +; AVX1-NEXT: [[TMP19:%.*]] = add nsw <2 x i32> [[WIDE_MASKED_GATHER10]], [[WIDE_MASKED_GATHER6]] +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>* +; AVX1-NEXT: store <2 x i32> [[TMP16]], <2 x i32>* [[TMP21]], align 4 +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i64 2 +; AVX1-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <2 x i32>* +; AVX1-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* [[TMP23]], align 4 +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i64 4 +; AVX1-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <2 x i32>* +; AVX1-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP25]], align 4 +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds 
i32, i32* [[TMP20]], i64 6 +; AVX1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <2 x i32>* +; AVX1-NEXT: store <2 x i32> [[TMP19]], <2 x i32>* [[TMP27]], align 4 +; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; AVX1-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; AVX1-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; AVX1-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: @@ -117,6 +152,37 @@ ; AVX2: for.body: ; AVX2-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; +; SLM-LABEL: @foo( +; SLM-NEXT: entry: +; SLM-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SLM: vector.ph: +; SLM-NEXT: br label [[VECTOR_BODY:%.*]] +; SLM: vector.body: +; SLM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SLM-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SLM-NEXT: [[TMP0:%.*]] = shl nsw <4 x i64> [[VEC_IND]], +; SLM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <4 x i64> [[TMP0]] +; SLM-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> , <4 x i32> undef) +; SLM-NEXT: [[TMP2:%.*]] = or <4 x i64> [[TMP0]], +; SLM-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], <4 x i64> [[TMP2]] +; SLM-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef) +; SLM-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER]] +; SLM-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; SLM-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* 
[[TMP6]], align 4 +; SLM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; SLM-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; SLM-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; SLM-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SLM: middle.block: +; SLM-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; SLM: scalar.ph: +; SLM-NEXT: br label [[FOR_BODY:%.*]] +; SLM: for.cond.cleanup: +; SLM-NEXT: ret void +; SLM: for.body: +; SLM-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -160,10 +160,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, 
[[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -180,95 +184,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr 
inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = 
load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4 -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; 
CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] 
= add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -287,7 +252,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ 
[[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -328,13 +293,25 @@ ; CHECK-NEXT: call void @init(i32* [[BASE]]) ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT]], <4 x i32*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT11]], <4 x i32*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT14]], <4 x i32*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i32*> poison, i32* [[BASE]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT17]], <4 x i32*> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP100:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP101:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP102:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP103:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -351,111 +328,44 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* 
[[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], 
align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP65:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i32> poison, i32 [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP65]], i32 1 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP66]], i32 2 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP67]], i32 3 -; CHECK-NEXT: [[TMP72:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP73:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP74:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> poison, i32 [[TMP72]], i32 0 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP73]], i32 1 -; CHECK-NEXT: [[TMP78:%.*]] = 
insertelement <4 x i32> [[TMP77]], i32 [[TMP74]], i32 2 -; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP75]], i32 3 -; CHECK-NEXT: [[TMP80:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP83:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 -; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 -; CHECK-NEXT: [[TMP88:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP90:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP91:%.*]] = load i32, i32* [[BASE]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 -; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 -; CHECK-NEXT: [[TMP96:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP97:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP98:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP99:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP71]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP79]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer -; 
CHECK-NEXT: [[TMP100]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP101]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] -; CHECK-NEXT: [[TMP102]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] -; CHECK-NEXT: [[TMP103]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT12]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT15]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER19:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT18]], i32 4, <4 x i1> , <4 x i32> undef) +; CHECK-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: 
[[TMP21:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_GATHER10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI20:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_GATHER13]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI21:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_GATHER16]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI22:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_GATHER19]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP25]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI20]] +; CHECK-NEXT: [[TMP26]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI21]] +; CHECK-NEXT: [[TMP27]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI22]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP104:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP104]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP101]], [[TMP100]] -; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX7]] -; CHECK-NEXT: [[TMP105:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP25]], [[TMP24]] +; CHECK-NEXT: [[BIN_RDX23:%.*]] = add <4 x i32> [[TMP26]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX24:%.*]] = add 
<4 x i32> [[TMP27]], [[BIN_RDX23]] +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX24]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -473,7 +383,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -512,276 +422,241 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP181:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP182:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP183:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: 
[[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, 
i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: 
[[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE39:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP136:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP137:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP138:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP139:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: 
[[TMP3:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP0]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP1]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP2]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP3]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP67:%.*]] = bitcast i16* [[TMP66]] to i32* -; CHECK-NEXT: [[TMP68:%.*]] = load i32, i32* [[TMP67]], align 4 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> poison, i32 [[TMP68]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to i32* +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP70:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP69]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] -; CHECK: pred.load.if4: 
-; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i16, i16* [[TMP72]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP74:%.*]] = bitcast i16* [[TMP73]] to i32* -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* [[TMP74]], align 4 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP75]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] -; CHECK: pred.load.continue5: -; CHECK-NEXT: [[TMP77:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP76]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP78]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] -; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP79:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i16, i16* [[TMP79]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP81:%.*]] = bitcast i16* [[TMP80]] to i32* -; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[TMP81]], align 4 -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] -; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP84:%.*]] = phi <4 x i32> [ [[TMP77]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP83]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP85]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] -; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32* -; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[TMP88]], align 4 -; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP89]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] -; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ 
[[TMP84]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP90]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 1 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] ; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP93:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds i16, i16* [[TMP93]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP95:%.*]] = bitcast i16* [[TMP94]] to i32* -; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, i16* [[TMP14]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16* [[TMP15]] to i32* +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP17]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] ; CHECK: pred.load.continue11: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP97]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP18]], [[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 2 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] ; 
CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP100:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds i16, i16* [[TMP100]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP102:%.*]] = bitcast i16* [[TMP101]] to i32* -; CHECK-NEXT: [[TMP103:%.*]] = load i32, i32* [[TMP102]], align 4 -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP103]], i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[TMP22]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i16* [[TMP23]] to i32* +; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP25]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] ; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP105:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP104]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP106]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP26]], [[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 3 +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] ; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds i16, i16* [[TMP107]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP109:%.*]] = bitcast i16* [[TMP108]] to i32* -; CHECK-NEXT: [[TMP110:%.*]] = load i32, i32* [[TMP109]], align 4 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP110]], i32 2 +; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[BASE]] to i16* +; 
CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i16, i16* [[TMP30]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16* [[TMP31]] to i32* +; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP33]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] ; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP112:%.*]] = phi <4 x i32> [ [[TMP105]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP111]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP27]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP34]], [[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 0 +; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] ; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP114:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i16, i16* [[TMP114]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP116:%.*]] = bitcast i16* [[TMP115]] to i32* -; CHECK-NEXT: [[TMP117:%.*]] = load i32, i32* [[TMP116]], align 4 -; CHECK-NEXT: [[TMP118:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP117]], i32 3 +; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i16, i16* [[TMP38]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i16* [[TMP39]] to i32* +; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> poison, i32 [[TMP41]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] ; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP119:%.*]] = phi <4 x i32> [ [[TMP112]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP118]], [[PRED_LOAD_IF16]] ] -; 
CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP120]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE15]] ], [ [[TMP42]], [[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 1 +; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] ; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP121:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds i16, i16* [[TMP121]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP123:%.*]] = bitcast i16* [[TMP122]] to i32* -; CHECK-NEXT: [[TMP124:%.*]] = load i32, i32* [[TMP123]], align 4 -; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x i32> poison, i32 [[TMP124]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP46:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i16, i16* [[TMP46]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP48:%.*]] = bitcast i16* [[TMP47]] to i32* +; CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP49]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] ; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP126:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP125]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP127:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; CHECK-NEXT: [[TMP51:%.*]] = phi <4 x i32> [ [[TMP43]], [[PRED_LOAD_CONTINUE17]] ], [ [[TMP50]], [[PRED_LOAD_IF18]] ] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 2 +; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] ; CHECK: pred.load.if20: -; CHECK-NEXT: 
[[TMP128:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP128]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32* -; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP126]], i32 [[TMP131]], i32 1 +; CHECK-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP54:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds i16, i16* [[TMP54]], i64 [[TMP53]] +; CHECK-NEXT: [[TMP56:%.*]] = bitcast i16* [[TMP55]] to i32* +; CHECK-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP56]], align 4 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <4 x i32> [[TMP51]], i32 [[TMP57]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] ; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP126]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP132]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; CHECK-NEXT: [[TMP59:%.*]] = phi <4 x i32> [ [[TMP51]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP58]], [[PRED_LOAD_IF20]] ] +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 3 +; CHECK-NEXT: br i1 [[TMP60]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] ; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP135:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i16, i16* [[TMP135]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP137:%.*]] = bitcast i16* [[TMP136]] to i32* -; CHECK-NEXT: [[TMP138:%.*]] = load i32, i32* [[TMP137]], align 4 -; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP138]], i32 2 +; CHECK-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP63:%.*]] = 
getelementptr inbounds i16, i16* [[TMP62]], i64 [[TMP61]] +; CHECK-NEXT: [[TMP64:%.*]] = bitcast i16* [[TMP63]] to i32* +; CHECK-NEXT: [[TMP65:%.*]] = load i32, i32* [[TMP64]], align 4 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i32> [[TMP59]], i32 [[TMP65]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] ; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP140:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP139]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP141:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP141]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; CHECK-NEXT: [[TMP67:%.*]] = phi <4 x i32> [ [[TMP59]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP66]], [[PRED_LOAD_IF22]] ] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 0 +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] ; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP142:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP143:%.*]] = getelementptr inbounds i16, i16* [[TMP142]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP144:%.*]] = bitcast i16* [[TMP143]] to i32* -; CHECK-NEXT: [[TMP145:%.*]] = load i32, i32* [[TMP144]], align 4 -; CHECK-NEXT: [[TMP146:%.*]] = insertelement <4 x i32> [[TMP140]], i32 [[TMP145]], i32 3 +; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i16, i16* [[TMP70]], i64 [[TMP69]] +; CHECK-NEXT: [[TMP72:%.*]] = bitcast i16* [[TMP71]] to i32* +; CHECK-NEXT: [[TMP73:%.*]] = load i32, i32* [[TMP72]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <4 x i32> poison, i32 [[TMP73]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] ; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP147:%.*]] = phi <4 x i32> [ [[TMP140]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP146]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP148:%.*]] = 
extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] +; CHECK-NEXT: [[TMP75:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE23]] ], [ [[TMP74]], [[PRED_LOAD_IF24]] ] +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 1 +; CHECK-NEXT: br i1 [[TMP76]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] ; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP149:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP150:%.*]] = getelementptr inbounds i16, i16* [[TMP149]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP151:%.*]] = bitcast i16* [[TMP150]] to i32* -; CHECK-NEXT: [[TMP152:%.*]] = load i32, i32* [[TMP151]], align 4 -; CHECK-NEXT: [[TMP153:%.*]] = insertelement <4 x i32> poison, i32 [[TMP152]], i32 0 +; CHECK-NEXT: [[TMP77:%.*]] = add i64 [[INDEX]], 9 +; CHECK-NEXT: [[TMP78:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i16, i16* [[TMP78]], i64 [[TMP77]] +; CHECK-NEXT: [[TMP80:%.*]] = bitcast i16* [[TMP79]] to i32* +; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* [[TMP80]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP75]], i32 [[TMP81]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] ; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP154:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP153]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP155:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP155]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] +; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP75]], [[PRED_LOAD_CONTINUE25]] ], [ [[TMP82]], [[PRED_LOAD_IF26]] ] +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 2 +; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] ; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP156:%.*]] = bitcast i32* 
[[BASE]] to i16* -; CHECK-NEXT: [[TMP157:%.*]] = getelementptr inbounds i16, i16* [[TMP156]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP158:%.*]] = bitcast i16* [[TMP157]] to i32* -; CHECK-NEXT: [[TMP159:%.*]] = load i32, i32* [[TMP158]], align 4 -; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP154]], i32 [[TMP159]], i32 1 +; CHECK-NEXT: [[TMP85:%.*]] = add i64 [[INDEX]], 10 +; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP85]] +; CHECK-NEXT: [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32* +; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[TMP88]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP83]], i32 [[TMP89]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] ; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP161:%.*]] = phi <4 x i32> [ [[TMP154]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP160]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP162:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP162]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ [[TMP83]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP90]], [[PRED_LOAD_IF28]] ] +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 3 +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] ; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP163:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i16, i16* [[TMP163]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP165:%.*]] = bitcast i16* [[TMP164]] to i32* -; CHECK-NEXT: [[TMP166:%.*]] = load i32, i32* [[TMP165]], align 4 -; CHECK-NEXT: [[TMP167:%.*]] = insertelement <4 x i32> [[TMP161]], i32 [[TMP166]], i32 2 +; CHECK-NEXT: [[TMP93:%.*]] = add i64 [[INDEX]], 11 +; CHECK-NEXT: [[TMP94:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i16, i16* 
[[TMP94]], i64 [[TMP93]] +; CHECK-NEXT: [[TMP96:%.*]] = bitcast i16* [[TMP95]] to i32* +; CHECK-NEXT: [[TMP97:%.*]] = load i32, i32* [[TMP96]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x i32> [[TMP91]], i32 [[TMP97]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] ; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP168:%.*]] = phi <4 x i32> [ [[TMP161]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP167]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP169:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP169]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] +; CHECK-NEXT: [[TMP99:%.*]] = phi <4 x i32> [ [[TMP91]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP98]], [[PRED_LOAD_IF30]] ] +; CHECK-NEXT: [[TMP100:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 0 +; CHECK-NEXT: br i1 [[TMP100]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33:%.*]] ; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP170:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP171:%.*]] = getelementptr inbounds i16, i16* [[TMP170]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP172:%.*]] = bitcast i16* [[TMP171]] to i32* -; CHECK-NEXT: [[TMP173:%.*]] = load i32, i32* [[TMP172]], align 4 -; CHECK-NEXT: [[TMP174:%.*]] = insertelement <4 x i32> [[TMP168]], i32 [[TMP173]], i32 3 +; CHECK-NEXT: [[TMP101:%.*]] = add i64 [[INDEX]], 12 +; CHECK-NEXT: [[TMP102:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP103:%.*]] = getelementptr inbounds i16, i16* [[TMP102]], i64 [[TMP101]] +; CHECK-NEXT: [[TMP104:%.*]] = bitcast i16* [[TMP103]] to i32* +; CHECK-NEXT: [[TMP105:%.*]] = load i32, i32* [[TMP104]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = insertelement <4 x i32> poison, i32 [[TMP105]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP175:%.*]] = phi <4 x i32> [ [[TMP168]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP174]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[TMP176:%.*]] = xor <4 x i1> [[TMP39]], -; 
CHECK-NEXT: [[TMP177:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP178:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP179:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP91]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP119]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP147]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP175]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP180]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP181]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] -; CHECK-NEXT: [[TMP182]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]] -; CHECK-NEXT: [[TMP183]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] +; CHECK-NEXT: [[TMP107:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE31]] ], [ [[TMP106]], [[PRED_LOAD_IF32]] ] +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 1 +; CHECK-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF34:%.*]], label [[PRED_LOAD_CONTINUE35:%.*]] +; CHECK: pred.load.if34: +; CHECK-NEXT: [[TMP109:%.*]] = add i64 [[INDEX]], 13 +; CHECK-NEXT: [[TMP110:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP111:%.*]] = getelementptr inbounds i16, i16* [[TMP110]], i64 [[TMP109]] +; CHECK-NEXT: [[TMP112:%.*]] = bitcast i16* [[TMP111]] to i32* +; CHECK-NEXT: [[TMP113:%.*]] = load i32, i32* [[TMP112]], align 4 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP107]], i32 [[TMP113]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE35]] +; CHECK: pred.load.continue35: +; CHECK-NEXT: [[TMP115:%.*]] = phi <4 x i32> [ [[TMP107]], [[PRED_LOAD_CONTINUE33]] ], [ [[TMP114]], [[PRED_LOAD_IF34]] ] +; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 2 +; CHECK-NEXT: br i1 [[TMP116]], label [[PRED_LOAD_IF36:%.*]], label 
[[PRED_LOAD_CONTINUE37:%.*]] +; CHECK: pred.load.if36: +; CHECK-NEXT: [[TMP117:%.*]] = add i64 [[INDEX]], 14 +; CHECK-NEXT: [[TMP118:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i16, i16* [[TMP118]], i64 [[TMP117]] +; CHECK-NEXT: [[TMP120:%.*]] = bitcast i16* [[TMP119]] to i32* +; CHECK-NEXT: [[TMP121:%.*]] = load i32, i32* [[TMP120]], align 4 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP115]], i32 [[TMP121]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE37]] +; CHECK: pred.load.continue37: +; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP115]], [[PRED_LOAD_CONTINUE35]] ], [ [[TMP122]], [[PRED_LOAD_IF36]] ] +; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 3 +; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF38:%.*]], label [[PRED_LOAD_CONTINUE39]] +; CHECK: pred.load.if38: +; CHECK-NEXT: [[TMP125:%.*]] = add i64 [[INDEX]], 15 +; CHECK-NEXT: [[TMP126:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds i16, i16* [[TMP126]], i64 [[TMP125]] +; CHECK-NEXT: [[TMP128:%.*]] = bitcast i16* [[TMP127]] to i32* +; CHECK-NEXT: [[TMP129:%.*]] = load i32, i32* [[TMP128]], align 4 +; CHECK-NEXT: [[TMP130:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP129]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE39]] +; CHECK: pred.load.continue39: +; CHECK-NEXT: [[TMP131:%.*]] = phi <4 x i32> [ [[TMP123]], [[PRED_LOAD_CONTINUE37]] ], [ [[TMP130]], [[PRED_LOAD_IF38]] ] +; CHECK-NEXT: [[TMP132:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP133:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP134:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP135:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[TMP35]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI40:%.*]] = select <4 x i1> 
[[WIDE_MASKED_GATHER7]], <4 x i32> [[TMP67]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI41:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[TMP99]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI42:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[TMP131]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP136]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP137]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI40]] +; CHECK-NEXT: [[TMP138]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI41]] +; CHECK-NEXT: [[TMP139]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP184:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP184]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP140:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP140]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP185:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP137]], [[TMP136]] +; CHECK-NEXT: [[BIN_RDX43:%.*]] = add <4 x i32> [[TMP138]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX44:%.*]] = add <4 x i32> [[TMP139]], [[BIN_RDX43]] +; CHECK-NEXT: [[TMP141:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX44]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP141]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -802,7 +677,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP141]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -850,10 +725,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 @@ -870,95 +749,56 @@ ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds 
i1, i1* [[TEST_BASE]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; 
CHECK-NEXT: [[TMP60:%.*]] = load i1, i1* [[TMP32]], align 1 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3 -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 0 -; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP70]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison) -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 4 -; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP72]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison) -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 8 -; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP74]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison) -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 12 -; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP76]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison) -; CHECK-NEXT: [[TMP77:%.*]] 
= xor <4 x i1> [[TMP40]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP48]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP56]], -; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i1> [[TMP64]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP84]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP20]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* 
[[BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP26]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 4 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP28]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 8 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP30]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 12 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP32]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: 
[[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP40]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP38]], [[TMP37]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP40]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -977,7 +817,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN_N]] ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1022,10 +862,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1024, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1 @@ -1043,95 +887,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = 
getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: 
[[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> 
@llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; 
CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 3072, 3072 ; CHECK-NEXT: br i1 
[[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1150,7 +955,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1244,245 +1049,210 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP148:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP149:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP150:%.*]], [[PRED_LOAD_CONTINUE33]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP151:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE39:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] 
= phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP104:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP105:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP106:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP107:%.*]], [[PRED_LOAD_CONTINUE39]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 18 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 20 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 22 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 24 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 
[[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, 
i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x 
i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP0]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP1]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP2]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP3]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, i32* [[TMP65]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] -; CHECK: pred.load.if4: -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP71:%.*]] = load i32, i32* [[TMP70]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] -; CHECK: 
pred.load.continue5: -; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] -; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP76:%.*]] = load i32, i32* [[TMP75]], align 4 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] -; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] -; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* [[TMP80]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] -; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] ; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP86:%.*]] = load 
i32, i32* [[TMP85]], align 4 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP13]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] ; CHECK: pred.load.continue11: -; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF10]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 2 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] ; CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP91:%.*]] = load i32, i32* [[TMP90]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP19]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] ; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP92]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label 
[[PRED_LOAD_CONTINUE15:%.*]] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP20]], [[PRED_LOAD_IF12]] ] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER]], i32 3 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] ; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP25]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] ; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP21]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP26]], [[PRED_LOAD_IF14]] ] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 0 +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] ; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP101:%.*]] = load i32, i32* [[TMP100]], align 4 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP29]] +; CHECK-NEXT: 
[[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP31]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] ; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE15]] ], [ [[TMP32]], [[PRED_LOAD_IF16]] ] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 1 +; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] ; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP106:%.*]] = load i32, i32* [[TMP105]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 +; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 10 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP36]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP37]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] ; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE17]] ], [ [[TMP38]], [[PRED_LOAD_IF18]] ] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 2 +; CHECK-NEXT: br i1 [[TMP40]], label 
[[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] ; CHECK: pred.load.if20: -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP111:%.*]] = load i32, i32* [[TMP110]], align 4 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[OFFSET_IDX]], 12 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP42]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP43]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] ; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; CHECK-NEXT: [[TMP45:%.*]] = phi <4 x i32> [ [[TMP39]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP44]], [[PRED_LOAD_IF20]] ] +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER7]], i32 3 +; CHECK-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] ; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP116:%.*]] = load i32, i32* [[TMP115]], align 4 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = add i64 [[OFFSET_IDX]], 14 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <4 x i32> [[TMP45]], i32 [[TMP49]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] ; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ 
[[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; CHECK-NEXT: [[TMP51:%.*]] = phi <4 x i32> [ [[TMP45]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP50]], [[PRED_LOAD_IF22]] ] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 0 +; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] ; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP121:%.*]] = load i32, i32* [[TMP120]], align 4 -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 +; CHECK-NEXT: [[TMP53:%.*]] = add i64 [[OFFSET_IDX]], 16 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = load i32, i32* [[TMP54]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i32> poison, i32 [[TMP55]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] ; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] +; CHECK-NEXT: [[TMP57:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE23]] ], [ [[TMP56]], [[PRED_LOAD_IF24]] ] +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 1 +; CHECK-NEXT: br i1 [[TMP58]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] ; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP126:%.*]] = load i32, i32* [[TMP125]], align 4 -; CHECK-NEXT: [[TMP127:%.*]] 
= insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 +; CHECK-NEXT: [[TMP59:%.*]] = add i64 [[OFFSET_IDX]], 18 +; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = load i32, i32* [[TMP60]], align 4 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i32> [[TMP57]], i32 [[TMP61]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] ; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] +; CHECK-NEXT: [[TMP63:%.*]] = phi <4 x i32> [ [[TMP57]], [[PRED_LOAD_CONTINUE25]] ], [ [[TMP62]], [[PRED_LOAD_IF26]] ] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 2 +; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] ; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 +; CHECK-NEXT: [[TMP65:%.*]] = add i64 [[OFFSET_IDX]], 20 +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = load i32, i32* [[TMP66]], align 4 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <4 x i32> [[TMP63]], i32 [[TMP67]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] ; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK-NEXT: [[TMP69:%.*]] = 
phi <4 x i32> [ [[TMP63]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP68]], [[PRED_LOAD_IF28]] ] +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER8]], i32 3 +; CHECK-NEXT: br i1 [[TMP70]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] ; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP136:%.*]] = load i32, i32* [[TMP135]], align 4 -; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 +; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[OFFSET_IDX]], 22 +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = load i32, i32* [[TMP72]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <4 x i32> [[TMP69]], i32 [[TMP73]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] ; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] +; CHECK-NEXT: [[TMP75:%.*]] = phi <4 x i32> [ [[TMP69]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP74]], [[PRED_LOAD_IF30]] ] +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 0 +; CHECK-NEXT: br i1 [[TMP76]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33:%.*]] ; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP141:%.*]] = load i32, i32* [[TMP140]], align 4 -; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 +; CHECK-NEXT: [[TMP77:%.*]] = add i64 [[OFFSET_IDX]], 24 +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = load i32, i32* [[TMP78]], align 4 +; 
CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i32> poison, i32 [[TMP79]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[TMP144:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP145:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP146:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP147:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP148]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP149]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] -; CHECK-NEXT: [[TMP150]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]] -; CHECK-NEXT: [[TMP151]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] +; CHECK-NEXT: [[TMP81:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE31]] ], [ [[TMP80]], [[PRED_LOAD_IF32]] ] +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 1 +; CHECK-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF34:%.*]], label [[PRED_LOAD_CONTINUE35:%.*]] +; CHECK: pred.load.if34: +; CHECK-NEXT: [[TMP83:%.*]] = add i64 [[OFFSET_IDX]], 26 +; CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP83]] +; CHECK-NEXT: [[TMP85:%.*]] = load i32, i32* [[TMP84]], align 4 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP81]], i32 [[TMP85]], i32 1 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE35]] +; CHECK: pred.load.continue35: +; CHECK-NEXT: [[TMP87:%.*]] = phi <4 x i32> [ [[TMP81]], 
[[PRED_LOAD_CONTINUE33]] ], [ [[TMP86]], [[PRED_LOAD_IF34]] ] +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 2 +; CHECK-NEXT: br i1 [[TMP88]], label [[PRED_LOAD_IF36:%.*]], label [[PRED_LOAD_CONTINUE37:%.*]] +; CHECK: pred.load.if36: +; CHECK-NEXT: [[TMP89:%.*]] = add i64 [[OFFSET_IDX]], 28 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP89]] +; CHECK-NEXT: [[TMP91:%.*]] = load i32, i32* [[TMP90]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP87]], i32 [[TMP91]], i32 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE37]] +; CHECK: pred.load.continue37: +; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP87]], [[PRED_LOAD_CONTINUE35]] ], [ [[TMP92]], [[PRED_LOAD_IF36]] ] +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[WIDE_MASKED_GATHER9]], i32 3 +; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF38:%.*]], label [[PRED_LOAD_CONTINUE39]] +; CHECK: pred.load.if38: +; CHECK-NEXT: [[TMP95:%.*]] = add i64 [[OFFSET_IDX]], 30 +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP95]] +; CHECK-NEXT: [[TMP97:%.*]] = load i32, i32* [[TMP96]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP97]], i32 3 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE39]] +; CHECK: pred.load.continue39: +; CHECK-NEXT: [[TMP99:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE37]] ], [ [[TMP98]], [[PRED_LOAD_IF38]] ] +; CHECK-NEXT: [[TMP100:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP101:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP102:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP103:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[TMP27]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI40:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[TMP51]], <4 x i32> zeroinitializer +; 
CHECK-NEXT: [[PREDPHI41:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[TMP75]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI42:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[TMP99]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP104]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP105]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI40]] +; CHECK-NEXT: [[TMP106]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI41]] +; CHECK-NEXT: [[TMP107]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP152:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2048 -; CHECK-NEXT: br i1 [[TMP152]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP108:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2048 +; CHECK-NEXT: br i1 [[TMP108]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP149]], [[TMP148]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP153:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP105]], [[TMP104]] +; CHECK-NEXT: [[BIN_RDX43:%.*]] = add <4 x i32> [[TMP106]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX44:%.*]] = add <4 x i32> [[TMP107]], [[BIN_RDX43]] +; CHECK-NEXT: [[TMP109:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX44]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 2048, 2048 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] +; 
CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP109]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1501,7 +1271,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4093 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP109]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1541,10 +1311,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: 
[[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1561,95 +1335,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* 
[[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: 
[[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> 
[[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds 
i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> 
[[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 
[ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1668,7 +1403,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1708,10 +1443,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1728,95 +1467,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 
[[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = 
load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; 
CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: 
[[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> 
[[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; 
CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1835,7 +1535,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -1875,10 +1575,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1895,95 +1599,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds 
i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; 
CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] 
= xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* 
[[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: 
[[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2002,7 +1667,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2051,10 +1716,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 @@ -2071,95 +1740,56 @@ ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; 
CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> poison, i1 [[TMP33]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i1> [[TMP39]], i1 [[TMP36]], i32 3 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> poison, i1 [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 1 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 2 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i1> [[TMP47]], i1 [[TMP44]], i32 3 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> poison, i1 [[TMP49]], i32 0 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 1 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x i1> [[TMP55]], i1 [[TMP52]], i32 3 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: 
[[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = load i1, i1* [[TMP32]], align 1 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> poison, i1 [[TMP57]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3 -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 0 -; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP70]], align 4 -; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 4 -; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP72]], align 4 -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 8 -; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 12 -; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP76]], align 4 -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP48]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP56]], -; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i1> [[TMP64]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> 
[[TMP40]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP84]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP20]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 
[[TMP13]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP26]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 4 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP28]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 8 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP30]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 12 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP36:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP40]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP38]], [[TMP37]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP40]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2178,7 +1808,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], [[MIN]] ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], 
[[LATCH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2225,10 +1855,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2245,95 +1879,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: 
[[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> 
[[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: 
[[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = 
select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x 
i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; 
CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2352,7 +1947,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: 
[[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2393,10 +1988,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2413,95 +2012,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: 
[[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], i1 [[TMP34]], i32 2 -; 
CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* 
[[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer 
-; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x 
i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> 
[[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2520,7 +2080,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: 
loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: @@ -2571,10 +2131,14 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i64> [[STEP_ADD1]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2591,95 +2155,56 @@ ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP14:%.*]] = add i64 
[[INDEX]], 14 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP32:%.*]] = load i1, i1* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i1, i1* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = load i1, i1* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i1, i1* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i1> poison, i1 [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i1> [[TMP36]], i1 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i1> [[TMP37]], 
i1 [[TMP34]], i32 2 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i1> [[TMP38]], i1 [[TMP35]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = load i1, i1* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i1, i1* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP42:%.*]] = load i1, i1* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP43:%.*]] = load i1, i1* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i1> poison, i1 [[TMP40]], i32 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x i1> [[TMP44]], i1 [[TMP41]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i1> [[TMP45]], i1 [[TMP42]], i32 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i1> [[TMP46]], i1 [[TMP43]], i32 3 -; CHECK-NEXT: [[TMP48:%.*]] = load i1, i1* [[TMP24]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i1, i1* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP50:%.*]] = load i1, i1* [[TMP26]], align 1 -; CHECK-NEXT: [[TMP51:%.*]] = load i1, i1* [[TMP27]], align 1 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i1> poison, i1 [[TMP48]], i32 0 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x i1> [[TMP52]], i1 [[TMP49]], i32 1 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x i1> [[TMP53]], i1 [[TMP50]], i32 2 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x i1> [[TMP54]], i1 [[TMP51]], i32 3 -; CHECK-NEXT: [[TMP56:%.*]] = load i1, i1* [[TMP28]], align 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i1, i1* [[TMP29]], align 1 -; CHECK-NEXT: [[TMP58:%.*]] = load i1, i1* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP59:%.*]] = load i1, i1* [[TMP31]], align 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i1> poison, i1 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP65:%.*]] = 
getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0 -; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 -; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 -; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 -; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) -; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> 
[[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD1]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i1, i1* [[TEST_BASE]], <4 x i64> [[STEP_ADD2]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP16]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP17]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP18]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i1> @llvm.masked.gather.v4i1.v4p0i1(<4 x i1*> [[TMP19]], i32 1, <4 x i1> , <4 x i1> undef) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> poison) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> poison) +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> poison) +; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER7]], +; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER8]], +; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[WIDE_MASKED_GATHER9]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI13:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER7]], <4 x i32> [[WIDE_MASKED_LOAD10]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI14:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER8]], <4 x i32> [[WIDE_MASKED_LOAD11]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI15:%.*]] = select <4 x i1> [[WIDE_MASKED_GATHER9]], <4 x i32> [[WIDE_MASKED_LOAD12]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP37]] = add <4 x i32> 
[[VEC_PHI4]], [[PREDPHI13]] +; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI14]] +; CHECK-NEXT: [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD2]], +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] -; CHECK-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX16]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX17]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -2698,7 +2223,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label 
[[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll --- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -12,9 +13,29 @@ ; } ;} -;CHECK-LABEL: @loop( -;CHECK-NOT: <4 x i32> define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +; CHECK-LABEL: @loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM3]] +; CHECK-NEXT: store i32 [[TMP0]], i32* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = 
getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 512 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -42,9 +63,41 @@ ; The same loop with parallel loop metadata added to the loop branch ; and the memory instructions. -;CHECK-LABEL: @parallel_loop( -;CHECK: <4 x i32> define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +; CHECK-LABEL: @parallel_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[WIDE_LOAD1]] to <4 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], <4 x i64> [[TMP4]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> [[WIDE_LOAD]], <4 x i32*> [[TMP5]], i32 4, <4 x i1> ), !llvm.access.group !1 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* 
[[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD2]], <4 x i32>* [[TMP9]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -74,10 +127,30 @@ ; The same loop with an illegal parallel loop metadata: the memory ; accesses refer to a different loop's identifier. 
-;CHECK-LABEL: @mixed_metadata( -;CHECK-NOT: <4 x i32> define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +; CHECK-LABEL: @mixed_metadata( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !llvm.access.group !7 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !llvm.access.group !7 +; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM3]] +; CHECK-NEXT: store i32 [[TMP0]], i32* [[ARRAYIDX4]], align 4, !llvm.access.group !8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !llvm.access.group !7 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX2]], align 4, !llvm.access.group !7 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 512 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -18,73 +18,49 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; 
CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, [[TBAA1:!tbaa !.*]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds 
[100 x i32], [100 x i32]* [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP12]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP16]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA1]] -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP20]], i32 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP21]], i32 2 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP22]], i32 3 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP23]], i32 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP25]], i32 6 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP26]], i32 7 -; CHECK-NEXT: [[TMP35:%.*]] = mul nsw <8 x i32> [[TMP34]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP36:%.*]] = add <8 x i32> [[VEC_PHI]], -; CHECK-NEXT: [[TMP37]] = add <8 x i32> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 -; CHECK-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* 
[[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA1:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], <4 x i64> [[VEC_IND]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA1]] +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[VEC_PHI]], +; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP37]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 96 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 100 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ 
[[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, [[TBAA1]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, [[TBAA1]] -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP40]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP14]], [[TMP13]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SUM_015]], 4 ; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; entry: %idxprom = sext i32 %i to i64 diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -480,8 +480,8 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi 
i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 @@ -493,62 +493,34 @@ ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP20]] -; CHECK-NEXT: 
[[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[TMP11]], align 1 -; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[TMP13]], align 1 -; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[TMP15]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP17]], align 1 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i8> poison, i8 [[TMP26]], i32 0 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i8> [[TMP30]], i8 [[TMP27]], i32 1 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i8> [[TMP31]], i8 [[TMP28]], i32 2 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i8> [[TMP32]], i8 [[TMP29]], i32 3 -; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[TMP19]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP21]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = load i8, i8* [[TMP23]], align 1 -; CHECK-NEXT: [[TMP37:%.*]] = load i8, i8* [[TMP25]], align 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i8> poison, i8 [[TMP34]], i32 0 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i8> [[TMP38]], i8 [[TMP35]], i32 1 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i8> [[TMP39]], i8 [[TMP36]], i32 2 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i8> [[TMP40]], i8 [[TMP37]], i32 3 -; CHECK-NEXT: [[TMP42:%.*]] = urem <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP43:%.*]] = urem <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP44:%.*]] = trunc <4 x i64> [[TMP42]] to <4 x i8> -; CHECK-NEXT: [[TMP45:%.*]] = trunc <4 x i64> [[TMP43]] to <4 x i8> -; CHECK-NEXT: [[TMP46:%.*]] = lshr <4 x i8> [[TMP33]], [[TMP44]] -; CHECK-NEXT: [[TMP47:%.*]] = lshr <4 x i8> [[TMP41]], [[TMP45]] -; CHECK-NEXT: [[TMP48:%.*]] = and <4 x i8> [[TMP46]], -; CHECK-NEXT: [[TMP49:%.*]] = and <4 x i8> [[TMP47]], -; 
CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP48]] to <4 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32> -; CHECK-NEXT: [[TMP52]] = add <4 x i32> [[VEC_PHI]], [[TMP50]] -; CHECK-NEXT: [[TMP53]] = add <4 x i32> [[VEC_PHI2]], [[TMP51]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE:%.*]], <4 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TEST_BASE]], <4 x i64> [[TMP9]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP10]], i32 1, <4 x i1> , <4 x i8> undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP11]], i32 1, <4 x i1> , <4 x i8> undef) +; CHECK-NEXT: [[TMP12:%.*]] = urem <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP13:%.*]] = urem <4 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = lshr <4 x i8> [[WIDE_MASKED_GATHER]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = lshr <4 x i8> [[WIDE_MASKED_GATHER3]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <4 x i8> [[TMP16]], +; CHECK-NEXT: [[TMP19:%.*]] = and <4 x i8> [[TMP17]], +; CHECK-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[TMP18]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI]], [[TMP20]] +; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI2]], [[TMP21]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP19:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP53]], [[TMP52]] -; CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -566,7 +538,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP55]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -basic-aa -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s ; REQUIRES: asserts @@ -16,7 +17,7 @@ ; consecutive vector of pointers store, therefore we should count it towards 
the ; widest vector count. ; -; CHECK: test_consecutive_store +; CHECK: LAA: Found a loop in test_consecutive_store ; CHECK: The Smallest and Widest types: 64 / 64 bits. define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 { %4 = load %0*, %0** %2, align 8 @@ -50,8 +51,8 @@ ; for (int i = 0; i < 1024; ++i) { ; p[i][y] = (int*) (1 + q[i]); ; } -; CHECK: test_nonconsecutive_store -; CHECK: The Smallest and Widest types: 16 / 16 bits. +; CHECK: LAA: Found a loop in test_nonconsecutive_store +; CHECK: The Smallest and Widest types: 16 / 64 bits. define void @test_nonconsecutive_store() nounwind ssp uwtable { br label %1 @@ -92,7 +93,7 @@ ;; Now we check the same rules for loads. We should take consecutive loads of ;; pointer types into account. -; CHECK: test_consecutive_ptr_load +; CHECK: LAA: Found a loop in test_consecutive_ptr_load ; CHECK: The Smallest and Widest types: 8 / 64 bits. define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { br label %1 @@ -116,8 +117,8 @@ } ;; However, we should not take unconsecutive loads of pointers into account. -; CHECK: test_nonconsecutive_ptr_load -; CHECK: LV: The Smallest and Widest types: 16 / 16 bits. +; CHECK: LAA: Found a loop in test_nonconsecutive_ptr_load +; CHECK: The Smallest and Widest types: 16 / 64 bits. 
define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable { br label %1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -805,45 +805,15 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP3]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP0]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP5]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP0]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP7]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i32 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP9]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP0]], i32 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP11]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP0]], i32 6 -; 
DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP13]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i32 7 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP15]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP2]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = load i8, i8* [[TMP4]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, i8* [[TMP6]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = load i8, i8* [[TMP8]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP10]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP12]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP14]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP16]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = insertelement <8 x i8> poison, i8 [[TMP17]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP18]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = insertelement <8 x i8> [[TMP26]], i8 [[TMP19]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP20]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = insertelement <8 x i8> [[TMP28]], i8 [[TMP21]], i32 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP29]], i8 [[TMP22]], i32 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = insertelement <8 x i8> [[TMP30]], i8 [[TMP23]], i32 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[TMP24]], i32 7 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = bitcast i8* [[TMP33]] to <8 x i8>* -; DISABLED_MASKED_STRIDED-NEXT: store <8 x i8> 
[[TMP32]], <8 x i8>* [[TMP34]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], <8 x i32> [[TMP0]] +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> [[TMP1]], i32 1, <8 x i1> , <8 x i8> undef) +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* +; DISABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[WIDE_MASKED_GATHER]], <8 x i8>* [[TMP3]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP35]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void ; @@ -852,17 +822,17 @@ ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> , <16 x i8> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], 
<16 x i8> poison, <8 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], <8 x i32> [[TMP0]] +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> [[TMP1]], i32 1, <8 x i1> , <8 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[WIDE_MASKED_GATHER]], <8 x i8>* [[TMP3]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -29,70 +29,43 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* ; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP5]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP7]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], i16* [[TMP4]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP12]], i16* [[TMP6]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP13]], i16* [[TMP8]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP14]], i16* [[TMP10]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = 
bitcast i16* [[TMP15]] to <4 x i16>* -; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = or <4 x i64> [[TMP2]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP18]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP20]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP22]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP24]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 0 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP26]], i16* [[TMP19]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 1 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP27]], i16* [[TMP21]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 2 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP28]], i16* [[TMP23]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 3 -; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP29]], i16* [[TMP25]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], <4 x i64> [[TMP2]] +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD]], <4 x i16*> [[TMP3]], i32 2, <4 x i1> ) +; DISABLED_MASKED_STRIDED-NEXT: 
[[TMP4:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>* +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP2]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], <4 x i64> [[TMP6]] +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD1]], <4 x i16*> [[TMP7]], i32 2, <4 x i1> ) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP30]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void ; ; ENABLED_MASKED_STRIDED-LABEL: @test1( ; ENABLED_MASKED_STRIDED-NEXT: entry: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 -1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>* -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[INDEX]], 2 +; 
ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], <4 x i64> [[TMP2]] +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD]], <4 x i16*> [[TMP3]], i32 2, <4 x i1> ) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>* ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 [[TMP6]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <16 x i16>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> [[INTERLEAVED_VEC]], <16 x i16>* [[TMP8]], i32 2, <16 x i1> ) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or <4 x i64> [[TMP2]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], <4 x i64> [[TMP6]] +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD1]], <4 x i16*> [[TMP7]], i32 2, <4 x i1> ) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; ENABLED_MASKED_STRIDED-NEXT: 
[[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -O3 -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -5,12 +6,22 @@ @x = common global [1024 x x86_fp80] zeroinitializer, align 16 -;CHECK-LABEL: @example( -;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>* -;CHECK: store -;CHECK: ret void - define void @example() nounwind ssp uwtable { +; CHECK-LABEL: @example( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x x86_fp80], [1024 x x86_fp80]* @x, i64 0, <2 x i64> [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.v2f80.v2p0f80(<2 x x86_fp80> , <2 x x86_fp80*> [[TMP0]], i32 16, <2 x 
i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body