diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1242,6 +1242,11 @@
   /// split during legalization. Zero is returned when the answer is unknown.
   unsigned getNumberOfParts(Type *Tp) const;
 
+  /// \returns True if the loop vectorizer should assign an artificially high
+  /// cost to predicated scalarized memory operations, effectively preventing
+  /// their vectorization.
+  bool useEmulatedMaskMemRefHack() const;
+
   /// \returns The cost of the address computation. For most targets this can be
   /// merged into the instruction indexing mode. Some targets might want to
   /// distinguish between address computation for memory operations on vector
@@ -1712,6 +1717,7 @@
                                            ArrayRef<Type *> Tys,
                                            TTI::TargetCostKind CostKind) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
+  virtual bool useEmulatedMaskMemRefHack() = 0;
   virtual InstructionCost
   getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
   virtual InstructionCost
@@ -2258,6 +2264,9 @@
   unsigned getNumberOfParts(Type *Tp) override {
     return Impl.getNumberOfParts(Tp);
   }
+  bool useEmulatedMaskMemRefHack() override {
+    return Impl.useEmulatedMaskMemRefHack();
+  }
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                             const SCEV *Ptr) override {
     return Impl.getAddressComputationCost(Ty, SE, Ptr);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -641,6 +641,8 @@
   // Assume that we have a register of the right size for the type.
   unsigned getNumberOfParts(Type *Tp) const { return 1; }
 
+  bool useEmulatedMaskMemRefHack() const { return false; }
+
   InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *,
                                             const SCEV *) const {
     return 0;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2029,6 +2029,10 @@
     return LT.first.isValid() ? *LT.first.getValue() : 0;
   }
 
+  bool useEmulatedMaskMemRefHack() {
+    return true; // FIXME: Conservatively keep the legacy hack by default.
+  }
+
   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                             const SCEV *) {
     return 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -903,6 +903,10 @@
   return TTIImpl->getNumberOfParts(Tp);
 }
 
+bool TargetTransformInfo::useEmulatedMaskMemRefHack() const {
+  return TTIImpl->useEmulatedMaskMemRefHack();
+}
+
 InstructionCost
 TargetTransformInfo::getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
                                                const SCEV *Ptr) const {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -161,6 +161,7 @@
                                          Align Alignment,
                                          TTI::TargetCostKind CostKind,
                                          const Instruction *I);
+  bool useEmulatedMaskMemRefHack();
   InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
                                             const SCEV *Ptr);
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4059,6 +4059,8 @@
   return Cost + LT.first;
 }
 
+bool X86TTIImpl::useEmulatedMaskMemRefHack() { return !ST->hasAVX2(); }
+
 InstructionCost
 X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
                                       const SCEV *Ptr) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6483,6 +6483,8 @@
   // Limited number of Masked Store/Scatter emulation was allowed.
   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
+  if (!TTI.useEmulatedMaskMemRefHack())
+    return false;
   return isa<LoadInst>(I) ||
          (isa<StoreInst>(I) &&
           NumPredStores > NumberOfStoresToPredicate);
diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
@@ -36,11 +36,11 @@
 ; AVX1: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 9 for VF 8 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 18 for VF 16 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 36 for VF 32 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB.loaded = load i32, i32* %inB, align 4
@@ -50,8 +50,8 @@
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 32 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i32, i32* %inB, align 4
-; AVX512: LV: Found an estimated cost of 22 for VF 4 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %valB.loaded = load i32, i32* %inB, align 4
+; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 18 for VF 16 For instruction: %valB.loaded = load i32, i32* %inB, align 4
 ; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction: %valB.loaded = load i32, i32* %inB, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
---
a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll @@ -36,11 +36,11 @@ ; AVX1: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i64, i64* %inB, align 8 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %valB.loaded = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %valB.loaded = load i64, i64* %inB, align 8 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction: %valB.loaded = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 5 for VF 4 For instruction: %valB.loaded = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 20 for VF 16 For instruction: %valB.loaded = load i64, i64* %inB, align 8 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 40 for VF 32 For instruction: %valB.loaded = load i64, i64* %inB, align 8 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i64, i64* %inB, align 8 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB.loaded = load i64, i64* %inB, align 8 @@ -50,8 +50,8 @@ ; AVX2-FASTGATHER: LV: Found an estimated cost of 48 for VF 32 For instruction: %valB.loaded = load i64, i64* %inB, align 8 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i64, i64* %inB, align 8 -; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i64, i64* %inB, align 8 -; AVX512: LV: Found an estimated cost of 24 for VF 4 For instruction: %valB.loaded = load i64, i64* %inB, align 8 +; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %valB.loaded = load i64, i64* %inB, align 8 +; AVX512: LV: Found an estimated cost of 12 for VF 4 For instruction: %valB.loaded = load i64, i64* %inB, align 8 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i64, i64* %inB, align 8 ; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %valB.loaded = load i64, i64* %inB, align 8 ; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %valB.loaded = load i64, i64* %inB, align 8 diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll --- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll @@ -89,30 +89,30 @@ ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 ; -; 
DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 ; -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 ; -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 ; -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test2" ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 ; -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 2 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 ; -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 4 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: %i4 = load i16, i16* %arrayidx7, align 2 ; -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 8 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: %i4 = load i16, i16* 
%arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction: %i2 = load i16, i16* %arrayidx2, align 2 @@ -164,17 +164,17 @@ ; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test" ; ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test" ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 7 for VF 2 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 9 for VF 4 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 9 for VF 8 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 16 For instruction: %i4 = load i16, i16* %arrayidx6, align 2 define void @test(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readnone %y) { diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll --- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll @@ -89,17 +89,17 @@ ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, i16* %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; 
DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 23 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 50 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test2" ; @@ -107,16 +107,16 @@ ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %2, i16* %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction: store i16 %0, i16* %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 27 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 20 for VF 16 For instruction: store i16 %2, i16* %arrayidx7, align 2 define void @test2(i16* noalias nocapture %points, i32 %numPoints, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) { entry: diff --git 
a/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll --- a/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-load-i16.ll @@ -35,18 +35,18 @@ ; AVX1: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i16, i16* %inB, align 2 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 8 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 17 for VF 16 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 34 for VF 32 For instruction: %valB.loaded = load i16, i16* %inB, align 2 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %valB.loaded = load i16, i16* %inB, align 2 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 8 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 17 for VF 16 For instruction: %valB.loaded = load i16, i16* %inB, align 2 +; AVX2-FASTGATHER: LV: Found an estimated cost of 34 for VF 32 For instruction: %valB.loaded = load i16, i16* %inB, align 2 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i16, i16* %inB, align 2 ; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: %valB.loaded = load i16, i16* %inB, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll b/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll --- a/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll +++ 
b/llvm/test/Analysis/CostModel/X86/masked-load-i8.ll @@ -35,18 +35,18 @@ ; AVX1: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i8, i8* %inB, align 1 ; ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-SLOWGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 8 for VF 8 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 16 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-SLOWGATHER: LV: Found an estimated cost of 33 for VF 32 For instruction: %valB.loaded = load i8, i8* %inB, align 1 ; ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %valB.loaded = load i8, i8* %inB, align 1 -; AVX2-FASTGATHER: LV: Found an estimated cost of 3000000 for VF 32 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 2 for VF 2 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 4 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 8 for VF 8 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 16 For instruction: %valB.loaded = load i8, i8* %inB, align 1 +; AVX2-FASTGATHER: LV: Found an estimated cost of 33 for VF 32 For instruction: %valB.loaded = load i8, i8* %inB, align 1 ; ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i8, i8* %inB, align 1 ; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: %valB.loaded = load i8, i8* %inB, align 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -95,7 +95,7 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, 
[[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE27:%.*]] ] ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX7]] ; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 @@ -128,33 +128,105 @@ ; FVW2-NEXT: [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD11]] to <2 x i64> ; FVW2-NEXT: [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD12]] to <2 x i64> ; FVW2-NEXT: [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD13]] to <2 x i64> -; FVW2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP20]] -; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP21]] -; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP22]] -; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP23]] -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP24]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef) -; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef) -; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef) -; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef) -; FVW2-NEXT: [[TMP28:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], -; FVW2-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], -; FVW2-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], -; FVW2-NEXT: [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], -; FVW2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]] -; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>* -; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]]) -; FVW2-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP32]], i64 2 -; FVW2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>* -; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]]) -; FVW2-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP32]], i64 4 -; FVW2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>* -; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]]) -; FVW2-NEXT: [[TMP38:%.*]] = getelementptr float, float* [[TMP32]], i64 6 -; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>* -; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]]) +; FVW2-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[TMP8]], i64 0 +; FVW2-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; FVW2: pred.load.if: +; FVW2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP20]], i64 0 +; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], i64 [[TMP25]] +; FVW2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4 +; FVW2-NEXT: 
[[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE]] +; FVW2: pred.load.continue: +; FVW2-NEXT: [[TMP29:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP28]], [[PRED_LOAD_IF]] ] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP8]], i64 1 +; FVW2-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; FVW2: pred.load.if14: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i64> [[TMP20]], i64 1 +; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP31]] +; FVW2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4 +; FVW2-NEXT: [[TMP34:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP33]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; FVW2: pred.load.continue15: +; FVW2-NEXT: [[TMP35:%.*]] = phi <2 x float> [ [[TMP29]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP34]], [[PRED_LOAD_IF14]] ] +; FVW2-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP9]], i64 0 +; FVW2-NEXT: br i1 [[TMP36]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; FVW2: pred.load.if16: +; FVW2-NEXT: [[TMP37:%.*]] = extractelement <2 x i64> [[TMP21]], i64 0 +; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP37]] +; FVW2-NEXT: [[TMP39:%.*]] = load float, float* [[TMP38]], align 4 +; FVW2-NEXT: [[TMP40:%.*]] = insertelement <2 x float> poison, float [[TMP39]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE17]] +; FVW2: pred.load.continue17: +; FVW2-NEXT: [[TMP41:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE15]] ], [ [[TMP40]], [[PRED_LOAD_IF16]] ] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x i1> [[TMP9]], i64 1 +; FVW2-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; FVW2: pred.load.if18: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i64> [[TMP21]], i64 1 +; FVW2-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP43]] +; FVW2-NEXT: [[TMP45:%.*]] = load float, float* [[TMP44]], align 4 +; FVW2-NEXT: [[TMP46:%.*]] = insertelement <2 x float> [[TMP41]], float [[TMP45]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE19]] +; FVW2: pred.load.continue19: +; FVW2-NEXT: [[TMP47:%.*]] = phi <2 x float> [ [[TMP41]], [[PRED_LOAD_CONTINUE17]] ], [ [[TMP46]], [[PRED_LOAD_IF18]] ] +; FVW2-NEXT: [[TMP48:%.*]] = extractelement <2 x i1> [[TMP10]], i64 0 +; FVW2-NEXT: br i1 [[TMP48]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; FVW2: pred.load.if20: +; FVW2-NEXT: [[TMP49:%.*]] = extractelement <2 x i64> [[TMP22]], i64 0 +; FVW2-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP49]] +; FVW2-NEXT: [[TMP51:%.*]] = load float, float* [[TMP50]], align 4 +; FVW2-NEXT: [[TMP52:%.*]] = insertelement <2 x float> poison, float [[TMP51]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE21]] +; FVW2: pred.load.continue21: +; FVW2-NEXT: [[TMP53:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE19]] ], [ [[TMP52]], [[PRED_LOAD_IF20]] ] +; FVW2-NEXT: [[TMP54:%.*]] = extractelement <2 x i1> [[TMP10]], i64 1 +; FVW2-NEXT: br i1 [[TMP54]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; FVW2: pred.load.if22: +; FVW2-NEXT: [[TMP55:%.*]] = extractelement <2 x i64> [[TMP22]], i64 1 +; FVW2-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP55]] +; FVW2-NEXT: [[TMP57:%.*]] = load float, float* [[TMP56]], align 4 +; FVW2-NEXT: [[TMP58:%.*]] = insertelement <2 x 
float> [[TMP53]], float [[TMP57]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE23]] +; FVW2: pred.load.continue23: +; FVW2-NEXT: [[TMP59:%.*]] = phi <2 x float> [ [[TMP53]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP58]], [[PRED_LOAD_IF22]] ] +; FVW2-NEXT: [[TMP60:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0 +; FVW2-NEXT: br i1 [[TMP60]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; FVW2: pred.load.if24: +; FVW2-NEXT: [[TMP61:%.*]] = extractelement <2 x i64> [[TMP23]], i64 0 +; FVW2-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP61]] +; FVW2-NEXT: [[TMP63:%.*]] = load float, float* [[TMP62]], align 4 +; FVW2-NEXT: [[TMP64:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE25]] +; FVW2: pred.load.continue25: +; FVW2-NEXT: [[TMP65:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE23]] ], [ [[TMP64]], [[PRED_LOAD_IF24]] ] +; FVW2-NEXT: [[TMP66:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1 +; FVW2-NEXT: br i1 [[TMP66]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27]] +; FVW2: pred.load.if26: +; FVW2-NEXT: [[TMP67:%.*]] = extractelement <2 x i64> [[TMP23]], i64 1 +; FVW2-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[TMP67]] +; FVW2-NEXT: [[TMP69:%.*]] = load float, float* [[TMP68]], align 4 +; FVW2-NEXT: [[TMP70:%.*]] = insertelement <2 x float> [[TMP65]], float [[TMP69]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE27]] +; FVW2: pred.load.continue27: +; FVW2-NEXT: [[TMP71:%.*]] = phi <2 x float> [ [[TMP65]], [[PRED_LOAD_CONTINUE25]] ], [ [[TMP70]], [[PRED_LOAD_IF26]] ] +; FVW2-NEXT: [[TMP72:%.*]] = fadd <2 x float> [[TMP35]], +; FVW2-NEXT: [[TMP73:%.*]] = fadd <2 x float> [[TMP47]], +; FVW2-NEXT: [[TMP74:%.*]] = fadd <2 x float> [[TMP59]], +; FVW2-NEXT: [[TMP75:%.*]] = fadd <2 x float> [[TMP71]], +; FVW2-NEXT: [[TMP76:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX7]] +; FVW2-NEXT: [[TMP77:%.*]] = bitcast float* [[TMP76]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP72]], <2 x float>* [[TMP77]], i32 4, <2 x i1> [[TMP8]]) +; FVW2-NEXT: [[TMP78:%.*]] = getelementptr float, float* [[TMP76]], i64 2 +; FVW2-NEXT: [[TMP79:%.*]] = bitcast float* [[TMP78]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP73]], <2 x float>* [[TMP79]], i32 4, <2 x i1> [[TMP9]]) +; FVW2-NEXT: [[TMP80:%.*]] = getelementptr float, float* [[TMP76]], i64 4 +; FVW2-NEXT: [[TMP81:%.*]] = bitcast float* [[TMP80]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP74]], <2 x float>* [[TMP81]], i32 4, <2 x i1> [[TMP10]]) +; FVW2-NEXT: [[TMP82:%.*]] = getelementptr float, float* [[TMP76]], i64 6 +; FVW2-NEXT: [[TMP83:%.*]] = bitcast float* [[TMP82]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP75]], <2 x float>* [[TMP83]], i32 4, <2 x i1> [[TMP11]]) ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8 -; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; FVW2-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FVW2-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; FVW2-NEXT: br i1 [[TMP84]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -365,40 +437,186 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; 
FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ] ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 -; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] -; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; FVW2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 -; FVW2-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 -; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 -; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer -; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) -; FVW2-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], -; FVW2-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0 -; FVW2-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2-NEXT: [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]] +; FVW2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4 +; FVW2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4 +; FVW2-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0 +; FVW2-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP16]], i64 1 +; FVW2-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4 +; FVW2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4 +; FVW2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +; FVW2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +; FVW2-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP11]], align 4 +; FVW2-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4 +; FVW2-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0 +; FVW2-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1 +; FVW2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4 +; FVW2-NEXT: [[TMP28:%.*]] = load 
i32, i32* [[TMP14]], align 4 +; FVW2-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0 +; FVW2-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1 +; FVW2-NEXT: [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer +; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer +; FVW2-NEXT: [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer +; FVW2-NEXT: [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; FVW2: pred.load.if: +; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1 +; FVW2-NEXT: [[TMP37:%.*]] = load float, float* [[TMP36]], align 4 +; FVW2-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE]] +; FVW2: pred.load.continue: +; FVW2-NEXT: [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ] +; FVW2-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] +; FVW2: pred.load.if8: +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1 +; FVW2-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4 +; FVW2-NEXT: [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE9]] +; FVW2: pred.load.continue9: +; FVW2-NEXT: [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF8]] ] +; FVW2-NEXT: [[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP45]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; FVW2: pred.load.if10: +; FVW2-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP1]], i32 1 +; FVW2-NEXT: [[TMP47:%.*]] = load float, float* [[TMP46]], align 4 +; FVW2-NEXT: [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE11]] +; FVW2: pred.load.continue11: +; FVW2-NEXT: [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP48]], [[PRED_LOAD_IF10]] ] +; FVW2-NEXT: [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP50]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; FVW2: pred.load.if12: +; FVW2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP2]], i32 1 +; FVW2-NEXT: [[TMP52:%.*]] = load float, float* [[TMP51]], align 4 +; FVW2-NEXT: [[TMP53:%.*]] = insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE13]] +; FVW2: pred.load.continue13: +; FVW2-NEXT: [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP53]], [[PRED_LOAD_IF12]] ] +; FVW2-NEXT: [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP55]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; FVW2: pred.load.if14: +; FVW2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP3]], i32 1 +; FVW2-NEXT: [[TMP57:%.*]] = load float, float* [[TMP56]], align 4 +; FVW2-NEXT: [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], 
i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; FVW2: pred.load.continue15: +; FVW2-NEXT: [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE13]] ], [ [[TMP58]], [[PRED_LOAD_IF14]] ] +; FVW2-NEXT: [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP60]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; FVW2: pred.load.if16: +; FVW2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP4]], i32 1 +; FVW2-NEXT: [[TMP62:%.*]] = load float, float* [[TMP61]], align 4 +; FVW2-NEXT: [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE17]] +; FVW2: pred.load.continue17: +; FVW2-NEXT: [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP63]], [[PRED_LOAD_IF16]] ] +; FVW2-NEXT: [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; FVW2: pred.load.if18: +; FVW2-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP5]], i32 1 +; FVW2-NEXT: [[TMP67:%.*]] = load float, float* [[TMP66]], align 4 +; FVW2-NEXT: [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE19]] +; FVW2: pred.load.continue19: +; FVW2-NEXT: [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP68]], [[PRED_LOAD_IF18]] ] +; FVW2-NEXT: [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP70]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; FVW2: pred.load.if20: +; FVW2-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP6]], i32 1 +; FVW2-NEXT: [[TMP72:%.*]] = load float, float* [[TMP71]], align 4 +; FVW2-NEXT: [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE21]] +; FVW2: pred.load.continue21: +; FVW2-NEXT: [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP73]], [[PRED_LOAD_IF20]] ] +; FVW2-NEXT: [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], +; FVW2-NEXT: [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], +; FVW2-NEXT: [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], +; FVW2-NEXT: [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], +; FVW2-NEXT: [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: -; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] -; FVW2-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0 -; FVW2-NEXT: store float [[TMP12]], float* [[TMP11]], align 4 +; FVW2-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0 +; FVW2-NEXT: store float [[TMP81]], float* [[TMP80]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: -; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] -; FVW2: pred.store.if8: -; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]] -; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1 -; FVW2-NEXT: store float [[TMP15]], float* [[TMP14]], align 4 -; 
FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] -; FVW2: pred.store.continue9: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; FVW2-NEXT: [[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP82]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; FVW2: pred.store.if22: +; FVW2-NEXT: [[TMP83:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]] +; FVW2-NEXT: [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1 +; FVW2-NEXT: store float [[TMP84]], float* [[TMP83]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]] +; FVW2: pred.store.continue23: +; FVW2-NEXT: [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP85]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; FVW2: pred.store.if24: +; FVW2-NEXT: [[TMP86:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP1]] +; FVW2-NEXT: [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0 +; FVW2-NEXT: store float [[TMP87]], float* [[TMP86]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]] +; FVW2: pred.store.continue25: +; FVW2-NEXT: [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP88]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; FVW2: pred.store.if26: +; FVW2-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP2]] +; FVW2-NEXT: [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1 +; FVW2-NEXT: store float [[TMP90]], float* [[TMP89]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]] +; FVW2: pred.store.continue27: +; FVW2-NEXT: [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; FVW2: pred.store.if28: +; FVW2-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP3]] +; FVW2-NEXT: [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0 +; FVW2-NEXT: store float [[TMP93]], float* [[TMP92]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.continue29: +; FVW2-NEXT: [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; FVW2: pred.store.if30: +; FVW2-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1 +; FVW2-NEXT: store float [[TMP96]], float* [[TMP95]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE31]] +; FVW2: pred.store.continue31: +; FVW2-NEXT: [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP97]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] +; FVW2: pred.store.if32: +; FVW2-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP5]] +; FVW2-NEXT: [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0 +; FVW2-NEXT: store float [[TMP99]], float* [[TMP98]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE33]] +; FVW2: pred.store.continue33: +; FVW2-NEXT: [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP100]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]] +; FVW2: 
pred.store.if34: +; FVW2-NEXT: [[TMP101:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP6]] +; FVW2-NEXT: [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1 +; FVW2-NEXT: store float [[TMP102]], float* [[TMP101]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE35]] +; FVW2: pred.store.continue35: +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8 +; FVW2-NEXT: [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -610,40 +828,186 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ] +; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE34:%.*]] ] ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 -; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] -; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; FVW2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 -; FVW2-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 -; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 -; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer -; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) -; FVW2-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], -; FVW2-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0 -; FVW2-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2-NEXT: [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]] +; FVW2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4 +; FVW2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4 +; FVW2-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0 +; FVW2-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 
[[TMP16]], i64 1 +; FVW2-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4 +; FVW2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4 +; FVW2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +; FVW2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +; FVW2-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP11]], align 4 +; FVW2-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4 +; FVW2-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0 +; FVW2-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1 +; FVW2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4 +; FVW2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP14]], align 4 +; FVW2-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0 +; FVW2-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1 +; FVW2-NEXT: [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer +; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer +; FVW2-NEXT: [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer +; FVW2-NEXT: [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; FVW2: pred.load.if: +; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1 +; FVW2-NEXT: [[TMP37:%.*]] = load float, float* [[TMP36]], align 4 +; FVW2-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE]] +; FVW2: pred.load.continue: +; FVW2-NEXT: [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ] +; FVW2-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; FVW2: pred.load.if7: +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1 +; FVW2-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4 +; FVW2-NEXT: [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; FVW2: pred.load.continue8: +; FVW2-NEXT: [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF7]] ] +; FVW2-NEXT: [[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP45]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; FVW2: pred.load.if9: +; FVW2-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP1]], i32 1 +; FVW2-NEXT: [[TMP47:%.*]] = load float, float* [[TMP46]], align 4 +; FVW2-NEXT: [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; FVW2: pred.load.continue10: +; FVW2-NEXT: [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE8]] ], [ [[TMP48]], [[PRED_LOAD_IF9]] ] +; FVW2-NEXT: [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP50]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; FVW2: pred.load.if11: +; FVW2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP2]], i32 1 +; FVW2-NEXT: [[TMP52:%.*]] = load float, float* [[TMP51]], align 4 +; FVW2-NEXT: [[TMP53:%.*]] = 
insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; FVW2: pred.load.continue12: +; FVW2-NEXT: [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP53]], [[PRED_LOAD_IF11]] ] +; FVW2-NEXT: [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP55]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; FVW2: pred.load.if13: +; FVW2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP3]], i32 1 +; FVW2-NEXT: [[TMP57:%.*]] = load float, float* [[TMP56]], align 4 +; FVW2-NEXT: [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; FVW2: pred.load.continue14: +; FVW2-NEXT: [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP58]], [[PRED_LOAD_IF13]] ] +; FVW2-NEXT: [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP60]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; FVW2: pred.load.if15: +; FVW2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP4]], i32 1 +; FVW2-NEXT: [[TMP62:%.*]] = load float, float* [[TMP61]], align 4 +; FVW2-NEXT: [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; FVW2: pred.load.continue16: +; FVW2-NEXT: [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP63]], [[PRED_LOAD_IF15]] ] +; FVW2-NEXT: [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; FVW2: pred.load.if17: +; FVW2-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP5]], i32 1 +; FVW2-NEXT: [[TMP67:%.*]] = load float, float* [[TMP66]], align 4 +; FVW2-NEXT: [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE18]] +; FVW2: pred.load.continue18: +; FVW2-NEXT: [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE16]] ], [ [[TMP68]], [[PRED_LOAD_IF17]] ] +; FVW2-NEXT: [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP70]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; FVW2: pred.load.if19: +; FVW2-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP6]], i32 1 +; FVW2-NEXT: [[TMP72:%.*]] = load float, float* [[TMP71]], align 4 +; FVW2-NEXT: [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE20]] +; FVW2: pred.load.continue20: +; FVW2-NEXT: [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP73]], [[PRED_LOAD_IF19]] ] +; FVW2-NEXT: [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], +; FVW2-NEXT: [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], +; FVW2-NEXT: [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], +; FVW2-NEXT: [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], +; FVW2-NEXT: [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: -; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1 -; FVW2-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0 -; FVW2-NEXT: store float [[TMP12]], float* [[TMP11]], 
align 4 +; FVW2-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1 +; FVW2-NEXT: [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0 +; FVW2-NEXT: store float [[TMP81]], float* [[TMP80]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: -; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.if7: -; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP0]], i32 1 -; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1 -; FVW2-NEXT: store float [[TMP15]], float* [[TMP14]], align 4 -; FVW2-NEXT: br label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.continue8: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; FVW2-NEXT: [[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP82]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP83:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP0]], i32 1 +; FVW2-NEXT: [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1 +; FVW2-NEXT: store float [[TMP84]], float* [[TMP83]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP85]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP86:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP1]], i32 1 +; FVW2-NEXT: [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0 +; FVW2-NEXT: store float [[TMP87]], float* [[TMP86]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP88]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP89:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP2]], i32 1 +; FVW2-NEXT: [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1 +; FVW2-NEXT: store float [[TMP90]], float* [[TMP89]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP92:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP3]], i32 1 +; FVW2-NEXT: [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0 +; FVW2-NEXT: store float [[TMP93]], float* [[TMP92]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP95:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP4]], i32 1 +; 
FVW2-NEXT: [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1 +; FVW2-NEXT: store float [[TMP96]], float* [[TMP95]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP97]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] +; FVW2: pred.store.if31: +; FVW2-NEXT: [[TMP98:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP5]], i32 1 +; FVW2-NEXT: [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0 +; FVW2-NEXT: store float [[TMP99]], float* [[TMP98]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE32]] +; FVW2: pred.store.continue32: +; FVW2-NEXT: [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP100]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34]] +; FVW2: pred.store.if33: +; FVW2-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP6]], i32 1 +; FVW2-NEXT: [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1 +; FVW2-NEXT: store float [[TMP102]], float* [[TMP101]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE34]] +; FVW2: pred.store.continue34: +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FVW2-NEXT: [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -841,40 +1205,186 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ] ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 -; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] -; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; FVW2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 -; FVW2-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 -; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 -; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer -; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) -; FVW2-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], -; FVW2-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0 -; FVW2-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2-NEXT: [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP7:%.*]] = 
getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]] +; FVW2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4 +; FVW2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4 +; FVW2-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0 +; FVW2-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP16]], i64 1 +; FVW2-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4 +; FVW2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4 +; FVW2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +; FVW2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +; FVW2-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP11]], align 4 +; FVW2-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4 +; FVW2-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0 +; FVW2-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1 +; FVW2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4 +; FVW2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP14]], align 4 +; FVW2-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0 +; FVW2-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1 +; FVW2-NEXT: [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer +; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer +; FVW2-NEXT: [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer +; FVW2-NEXT: [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; FVW2: pred.load.if: +; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1 +; FVW2-NEXT: [[TMP37:%.*]] = load float, float addrspace(1)* [[TMP36]], align 4 +; FVW2-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE]] +; FVW2: pred.load.continue: +; FVW2-NEXT: [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ] +; FVW2-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] +; FVW2: pred.load.if8: +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP0]], i32 1 +; FVW2-NEXT: [[TMP42:%.*]] = load float, float addrspace(1)* [[TMP41]], align 4 +; FVW2-NEXT: [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE9]] +; FVW2: pred.load.continue9: +; FVW2-NEXT: [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF8]] ] +; FVW2-NEXT: 
[[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP45]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; FVW2: pred.load.if10: +; FVW2-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP1]], i32 1 +; FVW2-NEXT: [[TMP47:%.*]] = load float, float addrspace(1)* [[TMP46]], align 4 +; FVW2-NEXT: [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE11]] +; FVW2: pred.load.continue11: +; FVW2-NEXT: [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP48]], [[PRED_LOAD_IF10]] ] +; FVW2-NEXT: [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP50]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; FVW2: pred.load.if12: +; FVW2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP2]], i32 1 +; FVW2-NEXT: [[TMP52:%.*]] = load float, float addrspace(1)* [[TMP51]], align 4 +; FVW2-NEXT: [[TMP53:%.*]] = insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE13]] +; FVW2: pred.load.continue13: +; FVW2-NEXT: [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP53]], [[PRED_LOAD_IF12]] ] +; FVW2-NEXT: [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP55]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; FVW2: pred.load.if14: +; FVW2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP3]], i32 1 +; FVW2-NEXT: [[TMP57:%.*]] = load float, float addrspace(1)* [[TMP56]], align 4 +; FVW2-NEXT: [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; FVW2: pred.load.continue15: +; FVW2-NEXT: [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE13]] ], [ [[TMP58]], [[PRED_LOAD_IF14]] ] +; FVW2-NEXT: [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP60]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; FVW2: pred.load.if16: +; FVW2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP4]], i32 1 +; FVW2-NEXT: [[TMP62:%.*]] = load float, float addrspace(1)* [[TMP61]], align 4 +; FVW2-NEXT: [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE17]] +; FVW2: pred.load.continue17: +; FVW2-NEXT: [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP63]], [[PRED_LOAD_IF16]] ] +; FVW2-NEXT: [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; FVW2: pred.load.if18: +; FVW2-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP5]], i32 1 +; FVW2-NEXT: [[TMP67:%.*]] = load float, float addrspace(1)* [[TMP66]], align 4 +; FVW2-NEXT: [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE19]] +; FVW2: pred.load.continue19: +; FVW2-NEXT: [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP68]], [[PRED_LOAD_IF18]] ] +; FVW2-NEXT: [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP70]], label [[PRED_LOAD_IF20:%.*]], label 
[[PRED_LOAD_CONTINUE21:%.*]] +; FVW2: pred.load.if20: +; FVW2-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP6]], i32 1 +; FVW2-NEXT: [[TMP72:%.*]] = load float, float addrspace(1)* [[TMP71]], align 4 +; FVW2-NEXT: [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE21]] +; FVW2: pred.load.continue21: +; FVW2-NEXT: [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP73]], [[PRED_LOAD_IF20]] ] +; FVW2-NEXT: [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], +; FVW2-NEXT: [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], +; FVW2-NEXT: [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], +; FVW2-NEXT: [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], +; FVW2-NEXT: [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: -; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] -; FVW2-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0 -; FVW2-NEXT: store float [[TMP12]], float addrspace(1)* [[TMP11]], align 4 +; FVW2-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0 +; FVW2-NEXT: store float [[TMP81]], float addrspace(1)* [[TMP80]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: -; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] -; FVW2: pred.store.if8: -; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]] -; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1 -; FVW2-NEXT: store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4 -; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] -; FVW2: pred.store.continue9: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FVW2-NEXT: [[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP82]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; FVW2: pred.store.if22: +; FVW2-NEXT: [[TMP83:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]] +; FVW2-NEXT: [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1 +; FVW2-NEXT: store float [[TMP84]], float addrspace(1)* [[TMP83]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]] +; FVW2: pred.store.continue23: +; FVW2-NEXT: [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP85]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; FVW2: pred.store.if24: +; FVW2-NEXT: [[TMP86:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP1]] +; FVW2-NEXT: [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0 +; FVW2-NEXT: store float [[TMP87]], float addrspace(1)* [[TMP86]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]] +; FVW2: pred.store.continue25: +; FVW2-NEXT: [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP88]], label 
[[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; FVW2: pred.store.if26: +; FVW2-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP2]] +; FVW2-NEXT: [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1 +; FVW2-NEXT: store float [[TMP90]], float addrspace(1)* [[TMP89]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]] +; FVW2: pred.store.continue27: +; FVW2-NEXT: [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; FVW2: pred.store.if28: +; FVW2-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP3]] +; FVW2-NEXT: [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0 +; FVW2-NEXT: store float [[TMP93]], float addrspace(1)* [[TMP92]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.continue29: +; FVW2-NEXT: [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; FVW2: pred.store.if30: +; FVW2-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1 +; FVW2-NEXT: store float [[TMP96]], float addrspace(1)* [[TMP95]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE31]] +; FVW2: pred.store.continue31: +; FVW2-NEXT: [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP97]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] +; FVW2: pred.store.if32: +; FVW2-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP5]] +; FVW2-NEXT: [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0 +; FVW2-NEXT: store float [[TMP99]], float addrspace(1)* [[TMP98]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE33]] +; FVW2: pred.store.continue33: +; FVW2-NEXT: [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP100]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]] +; FVW2: pred.store.if34: +; FVW2-NEXT: [[TMP101:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP6]] +; FVW2-NEXT: [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1 +; FVW2-NEXT: store float [[TMP102]], float addrspace(1)* [[TMP101]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE35]] +; FVW2: pred.store.continue35: +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8 +; FVW2-NEXT: [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -1072,40 +1582,186 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ] ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 -; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] -; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; 
FVW2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 -; FVW2-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 -; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 -; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer -; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) -; FVW2-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], -; FVW2-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0 -; FVW2-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2-NEXT: [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]] +; FVW2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4 +; FVW2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4 +; FVW2-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0 +; FVW2-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP16]], i64 1 +; FVW2-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4 +; FVW2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4 +; FVW2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +; FVW2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +; FVW2-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP11]], align 4 +; FVW2-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4 +; FVW2-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0 +; FVW2-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1 +; FVW2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4 +; FVW2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP14]], align 4 +; FVW2-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0 +; FVW2-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1 +; FVW2-NEXT: [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer +; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer +; FVW2-NEXT: [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer +; FVW2-NEXT: [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP35]], label 
[[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; FVW2: pred.load.if: +; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1 +; FVW2-NEXT: [[TMP37:%.*]] = load float, float addrspace(1)* [[TMP36]], align 4 +; FVW2-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE]] +; FVW2: pred.load.continue: +; FVW2-NEXT: [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ] +; FVW2-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] +; FVW2: pred.load.if8: +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP0]], i32 1 +; FVW2-NEXT: [[TMP42:%.*]] = load float, float addrspace(1)* [[TMP41]], align 4 +; FVW2-NEXT: [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE9]] +; FVW2: pred.load.continue9: +; FVW2-NEXT: [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF8]] ] +; FVW2-NEXT: [[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP45]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; FVW2: pred.load.if10: +; FVW2-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP1]], i32 1 +; FVW2-NEXT: [[TMP47:%.*]] = load float, float addrspace(1)* [[TMP46]], align 4 +; FVW2-NEXT: [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE11]] +; FVW2: pred.load.continue11: +; FVW2-NEXT: [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP48]], [[PRED_LOAD_IF10]] ] +; FVW2-NEXT: [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP50]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; FVW2: pred.load.if12: +; FVW2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP2]], i32 1 +; FVW2-NEXT: [[TMP52:%.*]] = load float, float addrspace(1)* [[TMP51]], align 4 +; FVW2-NEXT: [[TMP53:%.*]] = insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE13]] +; FVW2: pred.load.continue13: +; FVW2-NEXT: [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP53]], [[PRED_LOAD_IF12]] ] +; FVW2-NEXT: [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP55]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; FVW2: pred.load.if14: +; FVW2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP3]], i32 1 +; FVW2-NEXT: [[TMP57:%.*]] = load float, float addrspace(1)* [[TMP56]], align 4 +; FVW2-NEXT: [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; FVW2: pred.load.continue15: +; FVW2-NEXT: [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE13]] ], [ [[TMP58]], [[PRED_LOAD_IF14]] ] +; FVW2-NEXT: [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP60]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; FVW2: pred.load.if16: +; FVW2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], 
[[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP4]], i32 1 +; FVW2-NEXT: [[TMP62:%.*]] = load float, float addrspace(1)* [[TMP61]], align 4 +; FVW2-NEXT: [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE17]] +; FVW2: pred.load.continue17: +; FVW2-NEXT: [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP63]], [[PRED_LOAD_IF16]] ] +; FVW2-NEXT: [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; FVW2: pred.load.if18: +; FVW2-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP5]], i32 1 +; FVW2-NEXT: [[TMP67:%.*]] = load float, float addrspace(1)* [[TMP66]], align 4 +; FVW2-NEXT: [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE19]] +; FVW2: pred.load.continue19: +; FVW2-NEXT: [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP68]], [[PRED_LOAD_IF18]] ] +; FVW2-NEXT: [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP70]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; FVW2: pred.load.if20: +; FVW2-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[TMP6]], i32 1 +; FVW2-NEXT: [[TMP72:%.*]] = load float, float addrspace(1)* [[TMP71]], align 4 +; FVW2-NEXT: [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE21]] +; FVW2: pred.load.continue21: +; FVW2-NEXT: [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP73]], [[PRED_LOAD_IF20]] ] +; FVW2-NEXT: [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], +; FVW2-NEXT: [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], +; FVW2-NEXT: [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], +; FVW2-NEXT: [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], +; FVW2-NEXT: [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: -; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] -; FVW2-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0 -; FVW2-NEXT: store float [[TMP12]], float* [[TMP11]], align 4 +; FVW2-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0 +; FVW2-NEXT: store float [[TMP81]], float* [[TMP80]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: -; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] -; FVW2: pred.store.if8: -; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]] -; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1 -; FVW2-NEXT: store float [[TMP15]], float* [[TMP14]], align 4 -; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] -; FVW2: pred.store.continue9: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; FVW2-NEXT: 
[[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP82]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; FVW2: pred.store.if22: +; FVW2-NEXT: [[TMP83:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]] +; FVW2-NEXT: [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1 +; FVW2-NEXT: store float [[TMP84]], float* [[TMP83]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]] +; FVW2: pred.store.continue23: +; FVW2-NEXT: [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP85]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; FVW2: pred.store.if24: +; FVW2-NEXT: [[TMP86:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP1]] +; FVW2-NEXT: [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0 +; FVW2-NEXT: store float [[TMP87]], float* [[TMP86]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]] +; FVW2: pred.store.continue25: +; FVW2-NEXT: [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP88]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; FVW2: pred.store.if26: +; FVW2-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP2]] +; FVW2-NEXT: [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1 +; FVW2-NEXT: store float [[TMP90]], float* [[TMP89]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]] +; FVW2: pred.store.continue27: +; FVW2-NEXT: [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; FVW2: pred.store.if28: +; FVW2-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP3]] +; FVW2-NEXT: [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0 +; FVW2-NEXT: store float [[TMP93]], float* [[TMP92]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.continue29: +; FVW2-NEXT: [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; FVW2: pred.store.if30: +; FVW2-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1 +; FVW2-NEXT: store float [[TMP96]], float* [[TMP95]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE31]] +; FVW2: pred.store.continue31: +; FVW2-NEXT: [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP97]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] +; FVW2: pred.store.if32: +; FVW2-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP5]] +; FVW2-NEXT: [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0 +; FVW2-NEXT: store float [[TMP99]], float* [[TMP98]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE33]] +; FVW2: pred.store.continue33: +; FVW2-NEXT: [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP100]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]] +; FVW2: pred.store.if34: +; FVW2-NEXT: [[TMP101:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP6]] +; FVW2-NEXT: [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1 +; FVW2-NEXT: store float [[TMP102]], float* [[TMP101]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE35]] +; FVW2: pred.store.continue35: +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX7]], 8 +; FVW2-NEXT: [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -1303,40 +1959,186 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ] ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 -; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] -; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; FVW2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 -; FVW2-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 -; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 -; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[TMP6]], zeroinitializer -; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) -; FVW2-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], -; FVW2-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i64 0 -; FVW2-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2-NEXT: [[TMP1:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP6]] +; FVW2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4 +; FVW2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4 +; FVW2-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i64 0 +; FVW2-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP16]], i64 1 +; FVW2-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP9]], align 4 +; FVW2-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP10]], align 4 +; FVW2-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +; FVW2-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +; FVW2-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP11]], 
align 4 +; FVW2-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP12]], align 4 +; FVW2-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP23]], i64 0 +; FVW2-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i64 1 +; FVW2-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP13]], align 4 +; FVW2-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP14]], align 4 +; FVW2-NEXT: [[TMP29:%.*]] = insertelement <2 x i32> poison, i32 [[TMP27]], i64 0 +; FVW2-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[TMP28]], i64 1 +; FVW2-NEXT: [[TMP31:%.*]] = icmp sgt <2 x i32> [[TMP18]], zeroinitializer +; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[TMP22]], zeroinitializer +; FVW2-NEXT: [[TMP33:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer +; FVW2-NEXT: [[TMP34:%.*]] = icmp sgt <2 x i32> [[TMP30]], zeroinitializer +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; FVW2: pred.load.if: +; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], i64 [[OFFSET_IDX]], i32 1 +; FVW2-NEXT: [[TMP37:%.*]] = load float, float* [[TMP36]], align 4 +; FVW2-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP37]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE]] +; FVW2: pred.load.continue: +; FVW2-NEXT: [[TMP39:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP38]], [[PRED_LOAD_IF]] ] +; FVW2-NEXT: [[TMP40:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] +; FVW2: pred.load.if8: +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP0]], i32 1 +; FVW2-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4 +; FVW2-NEXT: [[TMP43:%.*]] = insertelement <2 x float> [[TMP39]], float [[TMP42]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE9]] +; FVW2: pred.load.continue9: +; FVW2-NEXT: [[TMP44:%.*]] = phi <2 x float> [ [[TMP39]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP43]], [[PRED_LOAD_IF8]] ] +; FVW2-NEXT: [[TMP45:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP45]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; FVW2: pred.load.if10: +; FVW2-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP1]], i32 1 +; FVW2-NEXT: [[TMP47:%.*]] = load float, float* [[TMP46]], align 4 +; FVW2-NEXT: [[TMP48:%.*]] = insertelement <2 x float> poison, float [[TMP47]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE11]] +; FVW2: pred.load.continue11: +; FVW2-NEXT: [[TMP49:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP48]], [[PRED_LOAD_IF10]] ] +; FVW2-NEXT: [[TMP50:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP50]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; FVW2: pred.load.if12: +; FVW2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP2]], i32 1 +; FVW2-NEXT: [[TMP52:%.*]] = load float, float* [[TMP51]], align 4 +; FVW2-NEXT: [[TMP53:%.*]] = insertelement <2 x float> [[TMP49]], float [[TMP52]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE13]] +; FVW2: pred.load.continue13: +; FVW2-NEXT: [[TMP54:%.*]] = phi <2 x float> [ [[TMP49]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP53]], [[PRED_LOAD_IF12]] ] +; FVW2-NEXT: [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP55]], label 
[[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; FVW2: pred.load.if14: +; FVW2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP3]], i32 1 +; FVW2-NEXT: [[TMP57:%.*]] = load float, float* [[TMP56]], align 4 +; FVW2-NEXT: [[TMP58:%.*]] = insertelement <2 x float> poison, float [[TMP57]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; FVW2: pred.load.continue15: +; FVW2-NEXT: [[TMP59:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE13]] ], [ [[TMP58]], [[PRED_LOAD_IF14]] ] +; FVW2-NEXT: [[TMP60:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP60]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; FVW2: pred.load.if16: +; FVW2-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP4]], i32 1 +; FVW2-NEXT: [[TMP62:%.*]] = load float, float* [[TMP61]], align 4 +; FVW2-NEXT: [[TMP63:%.*]] = insertelement <2 x float> [[TMP59]], float [[TMP62]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE17]] +; FVW2: pred.load.continue17: +; FVW2-NEXT: [[TMP64:%.*]] = phi <2 x float> [ [[TMP59]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP63]], [[PRED_LOAD_IF16]] ] +; FVW2-NEXT: [[TMP65:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; FVW2: pred.load.if18: +; FVW2-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP5]], i32 1 +; FVW2-NEXT: [[TMP67:%.*]] = load float, float* [[TMP66]], align 4 +; FVW2-NEXT: [[TMP68:%.*]] = insertelement <2 x float> poison, float [[TMP67]], i64 0 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE19]] +; FVW2: pred.load.continue19: +; FVW2-NEXT: [[TMP69:%.*]] = phi <2 x float> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP68]], [[PRED_LOAD_IF18]] ] +; FVW2-NEXT: [[TMP70:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP70]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; FVW2: pred.load.if20: +; FVW2-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[TMP6]], i32 1 +; FVW2-NEXT: [[TMP72:%.*]] = load float, float* [[TMP71]], align 4 +; FVW2-NEXT: [[TMP73:%.*]] = insertelement <2 x float> [[TMP69]], float [[TMP72]], i64 1 +; FVW2-NEXT: br label [[PRED_LOAD_CONTINUE21]] +; FVW2: pred.load.continue21: +; FVW2-NEXT: [[TMP74:%.*]] = phi <2 x float> [ [[TMP69]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP73]], [[PRED_LOAD_IF20]] ] +; FVW2-NEXT: [[TMP75:%.*]] = fadd <2 x float> [[TMP44]], +; FVW2-NEXT: [[TMP76:%.*]] = fadd <2 x float> [[TMP54]], +; FVW2-NEXT: [[TMP77:%.*]] = fadd <2 x float> [[TMP64]], +; FVW2-NEXT: [[TMP78:%.*]] = fadd <2 x float> [[TMP74]], +; FVW2-NEXT: [[TMP79:%.*]] = extractelement <2 x i1> [[TMP31]], i64 0 +; FVW2-NEXT: br i1 [[TMP79]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; FVW2: pred.store.if: -; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] -; FVW2-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i64 0 -; FVW2-NEXT: store float [[TMP12]], float addrspace(1)* [[TMP11]], align 4 +; FVW2-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP81:%.*]] = extractelement <2 x float> [[TMP75]], i64 0 +; FVW2-NEXT: store float [[TMP81]], float addrspace(1)* [[TMP80]], align 4 ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: -; FVW2-NEXT: 
[[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i64 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] -; FVW2: pred.store.if8: -; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]] -; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i64 1 -; FVW2-NEXT: store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4 -; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] -; FVW2: pred.store.continue9: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 -; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FVW2-NEXT: [[TMP82:%.*]] = extractelement <2 x i1> [[TMP31]], i64 1 +; FVW2-NEXT: br i1 [[TMP82]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; FVW2: pred.store.if22: +; FVW2-NEXT: [[TMP83:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]] +; FVW2-NEXT: [[TMP84:%.*]] = extractelement <2 x float> [[TMP75]], i64 1 +; FVW2-NEXT: store float [[TMP84]], float addrspace(1)* [[TMP83]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]] +; FVW2: pred.store.continue23: +; FVW2-NEXT: [[TMP85:%.*]] = extractelement <2 x i1> [[TMP32]], i64 0 +; FVW2-NEXT: br i1 [[TMP85]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; FVW2: pred.store.if24: +; FVW2-NEXT: [[TMP86:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP1]] +; FVW2-NEXT: [[TMP87:%.*]] = extractelement <2 x float> [[TMP76]], i64 0 +; FVW2-NEXT: store float [[TMP87]], float addrspace(1)* [[TMP86]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]] +; FVW2: pred.store.continue25: +; FVW2-NEXT: [[TMP88:%.*]] = extractelement <2 x i1> [[TMP32]], i64 1 +; FVW2-NEXT: br i1 [[TMP88]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; FVW2: pred.store.if26: +; FVW2-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP2]] +; FVW2-NEXT: [[TMP90:%.*]] = extractelement <2 x float> [[TMP76]], i64 1 +; FVW2-NEXT: store float [[TMP90]], float addrspace(1)* [[TMP89]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]] +; FVW2: pred.store.continue27: +; FVW2-NEXT: [[TMP91:%.*]] = extractelement <2 x i1> [[TMP33]], i64 0 +; FVW2-NEXT: br i1 [[TMP91]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; FVW2: pred.store.if28: +; FVW2-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP3]] +; FVW2-NEXT: [[TMP93:%.*]] = extractelement <2 x float> [[TMP77]], i64 0 +; FVW2-NEXT: store float [[TMP93]], float addrspace(1)* [[TMP92]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.continue29: +; FVW2-NEXT: [[TMP94:%.*]] = extractelement <2 x i1> [[TMP33]], i64 1 +; FVW2-NEXT: br i1 [[TMP94]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; FVW2: pred.store.if30: +; FVW2-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP96:%.*]] = extractelement <2 x float> [[TMP77]], i64 1 +; FVW2-NEXT: store float [[TMP96]], float addrspace(1)* [[TMP95]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE31]] +; FVW2: pred.store.continue31: +; FVW2-NEXT: [[TMP97:%.*]] = extractelement <2 x i1> [[TMP34]], i64 0 +; FVW2-NEXT: br i1 [[TMP97]], label 
[[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]] +; FVW2: pred.store.if32: +; FVW2-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP5]] +; FVW2-NEXT: [[TMP99:%.*]] = extractelement <2 x float> [[TMP78]], i64 0 +; FVW2-NEXT: store float [[TMP99]], float addrspace(1)* [[TMP98]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE33]] +; FVW2: pred.store.continue33: +; FVW2-NEXT: [[TMP100:%.*]] = extractelement <2 x i1> [[TMP34]], i64 1 +; FVW2-NEXT: br i1 [[TMP100]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]] +; FVW2: pred.store.if34: +; FVW2-NEXT: [[TMP101:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP6]] +; FVW2-NEXT: [[TMP102:%.*]] = extractelement <2 x float> [[TMP78]], i64 1 +; FVW2-NEXT: store float [[TMP102]], float addrspace(1)* [[TMP101]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE35]] +; FVW2: pred.store.continue35: +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8 +; FVW2-NEXT: [[TMP103:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP103]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll --- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -1,98 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; This test verifies that the loop vectorizer will NOT vectorize loops that -; will produce a tail loop with the optimize for size or the minimize size -; attributes. This is a target-dependent version of the test. -; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s -; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF +; This test verifies that the loop vectorizer will NOT produce a tail +; loop with the optimize for size or the minimize size attributes. 
+; REQUIRES: asserts +; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -S | FileCheck %s -check-prefixes=CHECK,CHECK-PGSO +; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -pgso -S | FileCheck %s -check-prefixes=CHECK,CHECK-PGSO +; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -pgso=false -S | FileCheck %s -check-prefixes=CHECK,CHECK-NO-PGSO +; RUN: opt < %s -passes='require,loop-vectorize' -S | FileCheck %s -check-prefixes=CHECK,CHECK-PGSO +; RUN: opt < %s -passes='require,loop-vectorize' -pgso -S | FileCheck %s -check-prefixes=CHECK,CHECK-PGSO +; RUN: opt < %s -passes='require,loop-vectorize' -pgso=false -S | FileCheck %s -check-prefixes=CHECK,CHECK-NO-PGSO target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" +target triple = "x86_64-unknown-linux-gnu" @tab = common global [32 x i8] zeroinitializer, align 1 define i32 @foo_optsize() #0 { ; CHECK-LABEL: @foo_optsize( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> poison, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[INDUCTION]], -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison) -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* -; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP1]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] -; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 ; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[INC]] = add nsw i32 
[[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; -; AUTOVF-LABEL: @foo_optsize( -; AUTOVF-NEXT: entry: -; AUTOVF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; AUTOVF: vector.ph: -; AUTOVF-NEXT: br label [[VECTOR_BODY:%.*]] -; AUTOVF: vector.body: -; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> poison, i32 [[INDEX]], i32 0 -; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer -; AUTOVF-NEXT: [[INDUCTION:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], -; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[INDUCTION]], -; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* -; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison) -; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer -; AUTOVF-NEXT: [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8> -; AUTOVF-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* -; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP1]]) -; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32 -; AUTOVF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 -; AUTOVF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] -; AUTOVF: middle.block: -; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AUTOVF: scalar.ph: -; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 224, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] -; AUTOVF: for.body: -; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; AUTOVF-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0 -; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 -; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 -; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] -; AUTOVF: for.end: -; AUTOVF-NEXT: ret i32 0 -; - entry: br label %for.body @@ -116,87 +54,20 @@ define i32 @foo_minsize() #1 { ; CHECK-LABEL: @foo_minsize( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> poison, i32 [[INDEX]], i32 0 -; CHECK-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[INDUCTION]], -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison) -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* -; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP1]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] -; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 ; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; -; AUTOVF-LABEL: @foo_minsize( -; AUTOVF-NEXT: entry: -; AUTOVF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; AUTOVF: vector.ph: -; AUTOVF-NEXT: br label [[VECTOR_BODY:%.*]] -; AUTOVF: vector.body: -; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> poison, i32 [[INDEX]], i32 0 -; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer -; AUTOVF-NEXT: [[INDUCTION:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], -; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[INDUCTION]], -; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 -; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* -; 
AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison) -; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer -; AUTOVF-NEXT: [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8> -; AUTOVF-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* -; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP1]]) -; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32 -; AUTOVF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 -; AUTOVF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] -; AUTOVF: middle.block: -; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; AUTOVF: scalar.ph: -; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 224, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] -; AUTOVF: for.body: -; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; AUTOVF-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0 -; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 -; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 -; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]] -; AUTOVF: for.end: -; AUTOVF-NEXT: ret i32 0 -; - entry: br label %for.body @@ -217,221 +88,935 @@ attributes #1 = { minsize } - -; We can vectorize this one by refraining from versioning for stride==1. 
-define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 { -; CHECK-LABEL: @scev4stride1( -; CHECK-NEXT: for.body.preheader: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> poison, i32 [[K:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <64 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 9 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 10 -; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 12 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 13 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 14 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[INDEX]], 15 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 17 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 18 -; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 19 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[INDEX]], 20 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[INDEX]], 21 -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[INDEX]], 22 -; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[INDEX]], 23 -; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[INDEX]], 24 -; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[INDEX]], 25 -; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[INDEX]], 26 -; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[INDEX]], 27 -; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[INDEX]], 28 -; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], 29 -; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[INDEX]], 30 -; CHECK-NEXT: [[TMP31:%.*]] = add i32 [[INDEX]], 31 -; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[INDEX]], 32 -; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[INDEX]], 33 -; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[INDEX]], 34 -; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[INDEX]], 35 -; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[INDEX]], 36 -; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[INDEX]], 37 -; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[INDEX]], 38 -; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[INDEX]], 39 -; CHECK-NEXT: [[TMP40:%.*]] = add i32 [[INDEX]], 40 -; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[INDEX]], 41 -; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[INDEX]], 42 -; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[INDEX]], 43 -; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[INDEX]], 44 -; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[INDEX]], 45 -; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[INDEX]], 46 -; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[INDEX]], 47 -; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[INDEX]], 48 -; CHECK-NEXT: [[TMP49:%.*]] = add i32 [[INDEX]], 49 -; CHECK-NEXT: [[TMP50:%.*]] = add i32 [[INDEX]], 50 -; CHECK-NEXT: [[TMP51:%.*]] = add i32 [[INDEX]], 51 -; CHECK-NEXT: [[TMP52:%.*]] = 
add i32 [[INDEX]], 52 -; CHECK-NEXT: [[TMP53:%.*]] = add i32 [[INDEX]], 53 -; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[INDEX]], 54 -; CHECK-NEXT: [[TMP55:%.*]] = add i32 [[INDEX]], 55 -; CHECK-NEXT: [[TMP56:%.*]] = add i32 [[INDEX]], 56 -; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[INDEX]], 57 -; CHECK-NEXT: [[TMP58:%.*]] = add i32 [[INDEX]], 58 -; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[INDEX]], 59 -; CHECK-NEXT: [[TMP60:%.*]] = add i32 [[INDEX]], 60 -; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[INDEX]], 61 -; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[INDEX]], 62 -; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[INDEX]], 63 -; CHECK-NEXT: [[TMP64:%.*]] = mul nsw <64 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <64 x i32> [[TMP64]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0i32(<64 x i32*> [[TMP65]], i32 4, <64 x i1> , <64 x i32> undef) -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[TMP66]], i32 0 -; CHECK-NEXT: [[TMP68:%.*]] = bitcast i32* [[TMP67]] to <64 x i32>* -; CHECK-NEXT: store <64 x i32> [[WIDE_MASKED_GATHER]], <64 x i32>* [[TMP68]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <64 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP69:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP69]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] -; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256 -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]] -; CHECK-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]] -; CHECK-NEXT: store i32 [[TMP70]], i32* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: ret void +define i32 @foo_pgso() !prof !14 { +; CHECK-PGSO-LABEL: @foo_pgso( +; CHECK-PGSO-NEXT: entry: +; CHECK-PGSO-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-PGSO: for.body: +; CHECK-PGSO-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-PGSO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-PGSO-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-PGSO-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-PGSO-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-PGSO-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-PGSO-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 +; CHECK-PGSO-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-PGSO: for.end: +; CHECK-PGSO-NEXT: ret i32 0 ; -; AUTOVF-LABEL: 
@scev4stride1( -; AUTOVF-NEXT: for.body.preheader: -; AUTOVF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; AUTOVF: vector.ph: -; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[K:%.*]], i32 0 -; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer -; AUTOVF-NEXT: br label [[VECTOR_BODY:%.*]] -; AUTOVF: vector.body: -; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AUTOVF-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; AUTOVF-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 -; AUTOVF-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 -; AUTOVF-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 -; AUTOVF-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 4 -; AUTOVF-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 5 -; AUTOVF-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 6 -; AUTOVF-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 7 -; AUTOVF-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; AUTOVF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <8 x i32> [[TMP8]] -; AUTOVF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> , <8 x i32> undef) -; AUTOVF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]] -; AUTOVF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 -; AUTOVF-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>* -; AUTOVF-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER]], <8 x i32>* [[TMP12]], align 4 -; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; AUTOVF-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; AUTOVF-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; AUTOVF-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] -; AUTOVF: middle.block: -; AUTOVF-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256 -; AUTOVF-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; AUTOVF: scalar.ph: -; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] -; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] -; AUTOVF: for.body: -; AUTOVF-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; AUTOVF-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]] -; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]] -; AUTOVF-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AUTOVF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]] -; AUTOVF-NEXT: store i32 [[TMP14]], i32* [[ARRAYIDX1]], align 4 -; AUTOVF-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 -; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256 -; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]] -; AUTOVF: for.end.loopexit: -; AUTOVF-NEXT: ret void +; CHECK-NO-PGSO-LABEL: @foo_pgso( +; CHECK-NO-PGSO-NEXT: iter.check: +; CHECK-NO-PGSO-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK-NO-PGSO: vector.main.loop.iter.check: +; CHECK-NO-PGSO-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NO-PGSO: vector.ph: +; CHECK-NO-PGSO-NEXT: br label [[VECTOR_BODY:%.*]] +; 
CHECK-NO-PGSO: vector.body: +; CHECK-NO-PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NO-PGSO-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 16 +; CHECK-NO-PGSO-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] +; CHECK-NO-PGSO-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]] +; CHECK-NO-PGSO-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +; CHECK-NO-PGSO-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; CHECK-NO-PGSO-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1 +; CHECK-NO-PGSO-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 16 +; CHECK-NO-PGSO-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NO-PGSO-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1 +; CHECK-NO-PGSO-NEXT: [[TMP8:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NO-PGSO-NEXT: [[TMP9:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD1]], zeroinitializer +; CHECK-NO-PGSO-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP8]], <16 x i8> , <16 x i8> +; CHECK-NO-PGSO-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> , <16 x i8> +; CHECK-NO-PGSO-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; CHECK-NO-PGSO-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* [[TMP12]], align 1 +; CHECK-NO-PGSO-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NO-PGSO-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[TMP13]], align 1 +; CHECK-NO-PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NO-PGSO-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 192 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NO-PGSO: middle.block: +; CHECK-NO-PGSO-NEXT: [[CMP_N:%.*]] = icmp eq i32 203, 192 +; CHECK-NO-PGSO-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK-NO-PGSO: vec.epilog.iter.check: +; CHECK-NO-PGSO-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NO-PGSO: vec.epilog.ph: +; CHECK-NO-PGSO-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NO-PGSO-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK-NO-PGSO: vec.epilog.vector.body: +; CHECK-NO-PGSO-NEXT: [[INDEX2:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[TMP15:%.*]] = add i32 [[INDEX2]], 0 +; CHECK-NO-PGSO-NEXT: [[TMP16:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP15]] +; CHECK-NO-PGSO-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP16]], i32 0 +; CHECK-NO-PGSO-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>* +; CHECK-NO-PGSO-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP18]], align 1 +; CHECK-NO-PGSO-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD5]], zeroinitializer +; CHECK-NO-PGSO-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP19]], <8 x i8> , <8 x i8> +; CHECK-NO-PGSO-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>* +; CHECK-NO-PGSO-NEXT: store <8 x i8> [[TMP20]], <8 x i8>* [[TMP21]], align 1 +; CHECK-NO-PGSO-NEXT: [[INDEX_NEXT3]] = add nuw i32 [[INDEX2]], 8 +; CHECK-NO-PGSO-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT3]], 200 +; CHECK-NO-PGSO-NEXT: br i1 
[[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NO-PGSO: vec.epilog.middle.block: +; CHECK-NO-PGSO-NEXT: [[CMP_N4:%.*]] = icmp eq i32 203, 200 +; CHECK-NO-PGSO-NEXT: br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NO-PGSO: vec.epilog.scalar.ph: +; CHECK-NO-PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 200, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NO-PGSO-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NO-PGSO: for.body: +; CHECK-NO-PGSO-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NO-PGSO-NEXT: [[TMP23:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NO-PGSO-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP23]], 0 +; CHECK-NO-PGSO-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NO-PGSO-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NO-PGSO-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NO-PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 +; CHECK-NO-PGSO-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NO-PGSO: for.end.loopexit: +; CHECK-NO-PGSO-NEXT: br label [[FOR_END]] +; CHECK-NO-PGSO: for.end: +; CHECK-NO-PGSO-NEXT: ret i32 0 ; -for.body.preheader: +entry: br label %for.body -for.body: ; preds = %for.body.preheader, %for.body - %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %mul = mul nsw i32 %i.07, %k - %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul - %0 = load i32, i32* %arrayidx, align 4 - %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07 - store i32 %0, i32* %arrayidx1, align 4 - %inc = add nuw nsw i32 %i.07, 1 - %exitcond = icmp eq i32 %inc, 256 - br i1 %exitcond, label %for.end.loopexit, label %for.body +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, 202 + br i1 %exitcond, label %for.end, label %for.body -for.end.loopexit: ; preds = %for.body +for.end: ; preds = %for.body + ret i32 0 +} + +; PR43371: don't run into an assert due to emitting SCEV runtime checks +; with OptForSize. +; +@cm_array = external global [2592 x i16], align 1 + +define void @pr43371() optsize { +; We do not want to generate SCEV predicates when optimising for size, because +; that will lead to extra code generation such as the SCEV overflow runtime +; checks. 
Not generating SCEV predicates can still result in vectorisation as +; the non-consecutive loads/stores can be scalarized: +; +; CHECK-LABEL: @pr43371( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY29:%.*]] +; CHECK: for.cond.cleanup28: +; CHECK-NEXT: unreachable +; CHECK: for.body29: +; CHECK-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC37:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] +; CHECK-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[IDXPROM34]] +; CHECK-NEXT: store i16 0, i16* [[ARRAYIDX35]], align 1 +; CHECK-NEXT: [[INC37]] = add i16 [[I24_0170]], 1 +; CHECK-NEXT: [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756 +; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY29]], label [[FOR_COND_CLEANUP28:%.*]] +; +entry: + br label %for.body29 + +for.cond.cleanup28: + unreachable + +for.body29: + %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] + %add33 = add i16 undef, %i24.0170 + %idxprom34 = zext i16 %add33 to i32 + %arrayidx35 = getelementptr [2592 x i16], [2592 x i16] * @cm_array, i32 0, i32 %idxprom34 + store i16 0, i16 * %arrayidx35, align 1 + %inc37 = add i16 %i24.0170, 1 + %cmp26 = icmp ult i16 %inc37, 756 + br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 +} + +define void @pr43371_pgso() !prof !14 { +; We do not want to generate SCEV predicates when optimising for size, because +; that will lead to extra code generation such as the SCEV overflow runtime +; checks. Not generating SCEV predicates can still result in vectorisation as +; the non-consecutive loads/stores can be scalarized: +; +; +; +; CHECK-PGSO-LABEL: @pr43371_pgso( +; CHECK-PGSO-NEXT: entry: +; CHECK-PGSO-NEXT: br label [[FOR_BODY29:%.*]] +; CHECK-PGSO: for.cond.cleanup28: +; CHECK-PGSO-NEXT: unreachable +; CHECK-PGSO: for.body29: +; CHECK-PGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC37:%.*]], [[FOR_BODY29]] ] +; CHECK-PGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] +; CHECK-PGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 +; CHECK-PGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[IDXPROM34]] +; CHECK-PGSO-NEXT: store i16 0, i16* [[ARRAYIDX35]], align 1 +; CHECK-PGSO-NEXT: [[INC37]] = add i16 [[I24_0170]], 1 +; CHECK-PGSO-NEXT: [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756 +; CHECK-PGSO-NEXT: br i1 [[CMP26]], label [[FOR_BODY29]], label [[FOR_COND_CLEANUP28:%.*]] +; +; CHECK-NO-PGSO-LABEL: @pr43371_pgso( +; CHECK-NO-PGSO-NEXT: entry: +; CHECK-NO-PGSO-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NO-PGSO: vector.scevcheck: +; CHECK-NO-PGSO-NEXT: br i1 undef, label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NO-PGSO: vector.ph: +; CHECK-NO-PGSO-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NO-PGSO: vector.body: +; CHECK-NO-PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 +; CHECK-NO-PGSO-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 +; CHECK-NO-PGSO-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 8 +; CHECK-NO-PGSO-NEXT: [[TMP2:%.*]] = add i16 undef, [[TMP0]] +; CHECK-NO-PGSO-NEXT: [[TMP3:%.*]] = add i16 undef, [[TMP1]] +; CHECK-NO-PGSO-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 +; CHECK-NO-PGSO-NEXT: [[TMP5:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NO-PGSO-NEXT: [[TMP6:%.*]] = getelementptr [2592 x 
i16], [2592 x i16]* @cm_array, i32 0, i32 [[TMP4]] +; CHECK-NO-PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[TMP5]] +; CHECK-NO-PGSO-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[TMP6]], i32 0 +; CHECK-NO-PGSO-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <8 x i16>* +; CHECK-NO-PGSO-NEXT: store <8 x i16> zeroinitializer, <8 x i16>* [[TMP9]], align 1 +; CHECK-NO-PGSO-NEXT: [[TMP10:%.*]] = getelementptr i16, i16* [[TMP6]], i32 8 +; CHECK-NO-PGSO-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <8 x i16>* +; CHECK-NO-PGSO-NEXT: store <8 x i16> zeroinitializer, <8 x i16>* [[TMP11]], align 1 +; CHECK-NO-PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-NO-PGSO-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 752 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NO-PGSO: middle.block: +; CHECK-NO-PGSO-NEXT: [[CMP_N:%.*]] = icmp eq i32 756, 752 +; CHECK-NO-PGSO-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP28:%.*]], label [[SCALAR_PH]] +; CHECK-NO-PGSO: scalar.ph: +; CHECK-NO-PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 752, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NO-PGSO-NEXT: br label [[FOR_BODY29:%.*]] +; CHECK-NO-PGSO: for.cond.cleanup28: +; CHECK-NO-PGSO-NEXT: unreachable +; CHECK-NO-PGSO: for.body29: +; CHECK-NO-PGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC37:%.*]], [[FOR_BODY29]] ] +; CHECK-NO-PGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] +; CHECK-NO-PGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 +; CHECK-NO-PGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], [2592 x i16]* @cm_array, i32 0, i32 [[IDXPROM34]] +; CHECK-NO-PGSO-NEXT: store i16 0, i16* [[ARRAYIDX35]], align 1 +; CHECK-NO-PGSO-NEXT: [[INC37]] = add i16 [[I24_0170]], 1 +; CHECK-NO-PGSO-NEXT: [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756 +; CHECK-NO-PGSO-NEXT: br i1 [[CMP26]], label [[FOR_BODY29]], label [[FOR_COND_CLEANUP28]], !llvm.loop [[LOOP21:![0-9]+]] +; +entry: + br label %for.body29 + +for.cond.cleanup28: + unreachable + +for.body29: + %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] + %add33 = add i16 undef, %i24.0170 + %idxprom34 = zext i16 %add33 to i32 + %arrayidx35 = getelementptr [2592 x i16], [2592 x i16] * @cm_array, i32 0, i32 %idxprom34 + store i16 0, i16 * %arrayidx35, align 1 + %inc37 = add i16 %i24.0170, 1 + %cmp26 = icmp ult i16 %inc37, 756 + br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 +} + +; PR45526: don't vectorize with fold-tail if first-order-recurrence is live-out. 
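As a plain C++ rendering of the pr45526 loop below (an illustration written for this note, assuming unsigned arithmetic matches the i32 IR; it is not test content), the %for phi is a first-order recurrence: it carries the previous iteration's piv+1, and its final value is returned after the loop, i.e. it is live-out, which is what rules out folding the tail by masking here.

```cpp
// Sketch of the pr45526 loop in plain C++. 'prev' plays the role of the %for
// phi: it holds the previous iteration's piv+1 (a first-order recurrence) and
// its final value is returned, so the recurrence is live-out of the loop.
unsigned pr45526_like() {
  unsigned piv = 0;
  unsigned prev = 5; // recurrence seed, matches the [ 5, %entry ] phi input
  for (;;) {
    unsigned pivPlus1 = piv + 1;
    if (piv >= 510)
      return prev;   // live-out use of the recurrence
    prev = pivPlus1; // both phis take %pivPlus1 on the back edge
    piv = pivPlus1;
  }
}
```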
+; +define i32 @pr45526() optsize { +; CHECK-LABEL: @pr45526( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PIV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[PIVPLUS1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 5, [[ENTRY]] ], [ [[PIVPLUS1]], [[LOOP]] ] +; CHECK-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ] +; CHECK-NEXT: ret i32 [[FOR_LCSSA]] +; +entry: + br label %loop + +loop: + %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] + %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] + %pivPlus1 = add nuw nsw i32 %piv, 1 + %cond = icmp ult i32 %piv, 510 + br i1 %cond, label %loop, label %exit + +exit: + ret i32 %for +} + +define i32 @pr45526_pgso() !prof !14 { +; CHECK-PGSO-LABEL: @pr45526_pgso( +; CHECK-PGSO-NEXT: entry: +; CHECK-PGSO-NEXT: br label [[LOOP:%.*]] +; CHECK-PGSO: loop: +; CHECK-PGSO-NEXT: [[PIV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[PIVPLUS1:%.*]], [[LOOP]] ] +; CHECK-PGSO-NEXT: [[FOR:%.*]] = phi i32 [ 5, [[ENTRY]] ], [ [[PIVPLUS1]], [[LOOP]] ] +; CHECK-PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1 +; CHECK-PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510 +; CHECK-PGSO-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK-PGSO: exit: +; CHECK-PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ] +; CHECK-PGSO-NEXT: ret i32 [[FOR_LCSSA]] +; +; CHECK-NO-PGSO-LABEL: @pr45526_pgso( +; CHECK-NO-PGSO-NEXT: entry: +; CHECK-NO-PGSO-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NO-PGSO: vector.ph: +; CHECK-NO-PGSO-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NO-PGSO: vector.body: +; CHECK-NO-PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[VEC_IND:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ , [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[STEP_ADD:%.*]] = add <16 x i32> [[VEC_IND]], +; CHECK-NO-PGSO-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NO-PGSO-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NO-PGSO-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NO-PGSO-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NO-PGSO-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 4 +; CHECK-NO-PGSO-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 5 +; CHECK-NO-PGSO-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 6 +; CHECK-NO-PGSO-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 7 +; CHECK-NO-PGSO-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 8 +; CHECK-NO-PGSO-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 9 +; CHECK-NO-PGSO-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 10 +; CHECK-NO-PGSO-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 11 +; CHECK-NO-PGSO-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 12 +; CHECK-NO-PGSO-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 13 +; CHECK-NO-PGSO-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 14 +; CHECK-NO-PGSO-NEXT: [[TMP15:%.*]] = add i32 [[INDEX]], 15 +; CHECK-NO-PGSO-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 16 +; CHECK-NO-PGSO-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 17 +; CHECK-NO-PGSO-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 18 +; CHECK-NO-PGSO-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 19 +; CHECK-NO-PGSO-NEXT: [[TMP20:%.*]] = add i32 [[INDEX]], 20 +; CHECK-NO-PGSO-NEXT: [[TMP21:%.*]] = add 
i32 [[INDEX]], 21 +; CHECK-NO-PGSO-NEXT: [[TMP22:%.*]] = add i32 [[INDEX]], 22 +; CHECK-NO-PGSO-NEXT: [[TMP23:%.*]] = add i32 [[INDEX]], 23 +; CHECK-NO-PGSO-NEXT: [[TMP24:%.*]] = add i32 [[INDEX]], 24 +; CHECK-NO-PGSO-NEXT: [[TMP25:%.*]] = add i32 [[INDEX]], 25 +; CHECK-NO-PGSO-NEXT: [[TMP26:%.*]] = add i32 [[INDEX]], 26 +; CHECK-NO-PGSO-NEXT: [[TMP27:%.*]] = add i32 [[INDEX]], 27 +; CHECK-NO-PGSO-NEXT: [[TMP28:%.*]] = add i32 [[INDEX]], 28 +; CHECK-NO-PGSO-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], 29 +; CHECK-NO-PGSO-NEXT: [[TMP30:%.*]] = add i32 [[INDEX]], 30 +; CHECK-NO-PGSO-NEXT: [[TMP31:%.*]] = add i32 [[INDEX]], 31 +; CHECK-NO-PGSO-NEXT: [[TMP32:%.*]] = add nuw nsw <16 x i32> [[VEC_IND]], +; CHECK-NO-PGSO-NEXT: [[TMP33]] = add nuw nsw <16 x i32> [[STEP_ADD]], +; CHECK-NO-PGSO-NEXT: [[TMP34:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP32]], <16 x i32> +; CHECK-NO-PGSO-NEXT: [[TMP35:%.*]] = shufflevector <16 x i32> [[TMP32]], <16 x i32> [[TMP33]], <16 x i32> +; CHECK-NO-PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NO-PGSO-NEXT: [[VEC_IND_NEXT]] = add <16 x i32> [[STEP_ADD]], +; CHECK-NO-PGSO-NEXT: [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], 480 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NO-PGSO: middle.block: +; CHECK-NO-PGSO-NEXT: [[CMP_N:%.*]] = icmp eq i32 511, 480 +; CHECK-NO-PGSO-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP33]], i32 15 +; CHECK-NO-PGSO-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP33]], i32 14 +; CHECK-NO-PGSO-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NO-PGSO: scalar.ph: +; CHECK-NO-PGSO-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 5, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-NO-PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 480, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NO-PGSO-NEXT: br label [[LOOP:%.*]] +; CHECK-NO-PGSO: loop: +; CHECK-NO-PGSO-NEXT: [[PIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PIVPLUS1:%.*]], [[LOOP]] ] +; CHECK-NO-PGSO-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PIVPLUS1]], [[LOOP]] ] +; CHECK-NO-PGSO-NEXT: [[PIVPLUS1]] = add nuw nsw i32 [[PIV]], 1 +; CHECK-NO-PGSO-NEXT: [[COND:%.*]] = icmp ult i32 [[PIV]], 510 +; CHECK-NO-PGSO-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NO-PGSO: exit: +; CHECK-NO-PGSO-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] +; CHECK-NO-PGSO-NEXT: ret i32 [[FOR_LCSSA]] +; +entry: + br label %loop + +loop: + %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] + %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] + %pivPlus1 = add nuw nsw i32 %piv, 1 + %cond = icmp ult i32 %piv, 510 + br i1 %cond, label %loop, label %exit + +exit: + ret i32 %for +} + +; PR46228: Vectorize w/o versioning for unit stride under optsize and enabled +; vectorization. 
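To make "versioning for unit stride" concrete, here is a plain C++ sketch (an illustration written for this note, reusing the 1025 trip count and the stored value 42 from the test; it is not generated output): the vectorizer would guard a contiguous fast path with a runtime BStride == 1 check and keep the original strided loop as a fall back. Under optsize that extra check and the duplicated loop cost code size, so the @stride1 checks below expect the strided store to be predicated and scalarized instead, while the PGSO variant further down is still allowed to emit the stride check.

```cpp
// Sketch of what versioning the stride1 loop for unit stride would produce.
void stride1_versioned(short *B, int BStride) {
  if (BStride == 1) {
    // Fast path: unit stride proven at run time, the stores are consecutive
    // and can be emitted as wide vector stores.
    for (int i = 0; i < 1025; ++i)
      B[i] = 42;
  } else {
    // Fall-back path: the original strided loop, kept for any other stride.
    for (int i = 0; i < 1025; ++i)
      B[i * BStride] = 42;
  }
}
```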
+ +; NOTE: Some assertions have been autogenerated by utils/update_test_checks.py +define void @stride1(i16* noalias %B, i32 %BStride) optsize { +; CHECK-PGSO-LABEL: @stride1( +; CHECK-PGSO-NEXT: entry: +; CHECK-PGSO-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-PGSO: vector.ph: +; CHECK-PGSO-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-PGSO: vector.body: +; CHECK-PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] +; CHECK-PGSO-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ] +; CHECK-PGSO-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], +; CHECK-PGSO-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; CHECK-PGSO-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-PGSO: pred.store.if: +; CHECK-PGSO-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0 +; CHECK-PGSO-NEXT: [[TMP3:%.*]] = mul nsw i32 [[TMP2]], [[BSTRIDE:%.*]] +; CHECK-PGSO-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[TMP4]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK-PGSO: pred.store.continue: +; CHECK-PGSO-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; CHECK-PGSO-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; CHECK-PGSO: pred.store.if1: +; CHECK-PGSO-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 1 +; CHECK-PGSO-NEXT: [[TMP7:%.*]] = mul nsw i32 [[TMP6]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP7]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[TMP8]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK-PGSO: pred.store.continue2: +; CHECK-PGSO-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; CHECK-PGSO-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK-PGSO: pred.store.if3: +; CHECK-PGSO-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 2 +; CHECK-PGSO-NEXT: [[TMP11:%.*]] = mul nsw i32 [[TMP10]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP11]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[TMP12]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK-PGSO: pred.store.continue4: +; CHECK-PGSO-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; CHECK-PGSO-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK-PGSO: pred.store.if5: +; CHECK-PGSO-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 3 +; CHECK-PGSO-NEXT: [[TMP15:%.*]] = mul nsw i32 [[TMP14]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP15]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[TMP16]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK-PGSO: pred.store.continue6: +; CHECK-PGSO-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; CHECK-PGSO-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; CHECK-PGSO: pred.store.if7: +; CHECK-PGSO-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 4 +; CHECK-PGSO-NEXT: [[TMP19:%.*]] = mul nsw i32 [[TMP18]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP19]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[TMP20]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE8]] +; CHECK-PGSO: 
pred.store.continue8: +; CHECK-PGSO-NEXT: [[TMP21:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; CHECK-PGSO-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK-PGSO: pred.store.if9: +; CHECK-PGSO-NEXT: [[TMP22:%.*]] = add i32 [[INDEX]], 5 +; CHECK-PGSO-NEXT: [[TMP23:%.*]] = mul nsw i32 [[TMP22]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP23]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[TMP24]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE10]] +; CHECK-PGSO: pred.store.continue10: +; CHECK-PGSO-NEXT: [[TMP25:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; CHECK-PGSO-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK-PGSO: pred.store.if11: +; CHECK-PGSO-NEXT: [[TMP26:%.*]] = add i32 [[INDEX]], 6 +; CHECK-PGSO-NEXT: [[TMP27:%.*]] = mul nsw i32 [[TMP26]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP27]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[TMP28]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE12]] +; CHECK-PGSO: pred.store.continue12: +; CHECK-PGSO-NEXT: [[TMP29:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; CHECK-PGSO-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] +; CHECK-PGSO: pred.store.if13: +; CHECK-PGSO-NEXT: [[TMP30:%.*]] = add i32 [[INDEX]], 7 +; CHECK-PGSO-NEXT: [[TMP31:%.*]] = mul nsw i32 [[TMP30]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP31]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[TMP32]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE14]] +; CHECK-PGSO: pred.store.continue14: +; CHECK-PGSO-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-PGSO-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; CHECK-PGSO-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1032 +; CHECK-PGSO-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-PGSO: middle.block: +; CHECK-PGSO-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-PGSO: scalar.ph: +; CHECK-PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1032, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-PGSO-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-PGSO: for.body: +; CHECK-PGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-PGSO-NEXT: [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[GEPOFB:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[MULB]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[GEPOFB]], align 4 +; CHECK-PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 +; CHECK-PGSO-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-PGSO: for.end: +; CHECK-PGSO-NEXT: ret void +; +; CHECK-NO-PGSO-LABEL: @stride1( +; CHECK-NO-PGSO-NEXT: entry: +; CHECK-NO-PGSO-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NO-PGSO: vector.ph: +; CHECK-NO-PGSO-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NO-PGSO: vector.body: +; CHECK-NO-PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ] +; CHECK-NO-PGSO-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE14]] ] +; CHECK-NO-PGSO-NEXT: [[TMP0:%.*]] = icmp ule 
<8 x i32> [[VEC_IND]], +; CHECK-NO-PGSO-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NO-PGSO: pred.store.if: +; CHECK-NO-PGSO-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NO-PGSO-NEXT: [[TMP3:%.*]] = mul nsw i32 [[TMP2]], [[BSTRIDE:%.*]] +; CHECK-NO-PGSO-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[TMP4]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK-NO-PGSO: pred.store.continue: +; CHECK-NO-PGSO-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; CHECK-NO-PGSO: pred.store.if1: +; CHECK-NO-PGSO-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NO-PGSO-NEXT: [[TMP7:%.*]] = mul nsw i32 [[TMP6]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP7]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[TMP8]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE2]] +; CHECK-NO-PGSO: pred.store.continue2: +; CHECK-NO-PGSO-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK-NO-PGSO: pred.store.if3: +; CHECK-NO-PGSO-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NO-PGSO-NEXT: [[TMP11:%.*]] = mul nsw i32 [[TMP10]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP11]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[TMP12]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK-NO-PGSO: pred.store.continue4: +; CHECK-NO-PGSO-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK-NO-PGSO: pred.store.if5: +; CHECK-NO-PGSO-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NO-PGSO-NEXT: [[TMP15:%.*]] = mul nsw i32 [[TMP14]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP15]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[TMP16]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK-NO-PGSO: pred.store.continue6: +; CHECK-NO-PGSO-NEXT: [[TMP17:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; CHECK-NO-PGSO: pred.store.if7: +; CHECK-NO-PGSO-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 4 +; CHECK-NO-PGSO-NEXT: [[TMP19:%.*]] = mul nsw i32 [[TMP18]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP19]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[TMP20]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE8]] +; CHECK-NO-PGSO: pred.store.continue8: +; CHECK-NO-PGSO-NEXT: [[TMP21:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; CHECK-NO-PGSO: pred.store.if9: +; CHECK-NO-PGSO-NEXT: [[TMP22:%.*]] = add i32 [[INDEX]], 5 +; CHECK-NO-PGSO-NEXT: [[TMP23:%.*]] = mul nsw i32 [[TMP22]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP23]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[TMP24]], align 4 +; CHECK-NO-PGSO-NEXT: 
br label [[PRED_STORE_CONTINUE10]] +; CHECK-NO-PGSO: pred.store.continue10: +; CHECK-NO-PGSO-NEXT: [[TMP25:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; CHECK-NO-PGSO: pred.store.if11: +; CHECK-NO-PGSO-NEXT: [[TMP26:%.*]] = add i32 [[INDEX]], 6 +; CHECK-NO-PGSO-NEXT: [[TMP27:%.*]] = mul nsw i32 [[TMP26]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[TMP28:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP27]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[TMP28]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE12]] +; CHECK-NO-PGSO: pred.store.continue12: +; CHECK-NO-PGSO-NEXT: [[TMP29:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]] +; CHECK-NO-PGSO: pred.store.if13: +; CHECK-NO-PGSO-NEXT: [[TMP30:%.*]] = add i32 [[INDEX]], 7 +; CHECK-NO-PGSO-NEXT: [[TMP31:%.*]] = mul nsw i32 [[TMP30]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[TMP32:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP31]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[TMP32]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE14]] +; CHECK-NO-PGSO: pred.store.continue14: +; CHECK-NO-PGSO-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NO-PGSO-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; CHECK-NO-PGSO-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1032 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NO-PGSO: middle.block: +; CHECK-NO-PGSO-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NO-PGSO: scalar.ph: +; CHECK-NO-PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1032, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NO-PGSO-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NO-PGSO: for.body: +; CHECK-NO-PGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NO-PGSO-NEXT: [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[GEPOFB:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[MULB]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[GEPOFB]], align 4 +; CHECK-NO-PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NO-PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 +; CHECK-NO-PGSO-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NO-PGSO: for.end: +; CHECK-NO-PGSO-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ] + %mulB = mul nsw i32 %iv, %BStride + %gepOfB = getelementptr inbounds i16, i16* %B, i32 %mulB + store i16 42, i16* %gepOfB, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, 1025 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !15 + +for.end: ret void } -attributes #2 = { optsize } - - -; PR39497 -; We can't vectorize this one because we version for overflow check and tiny -; trip count leads to opt-for-size (which otherwise could fold the tail by -; masking). 
-define i32 @main() local_unnamed_addr { -; CHECK-LABEL: @main( -; CHECK-NEXT: while.cond: -; CHECK-NEXT: br label [[FOR_COND:%.*]] -; CHECK: for.cond: -; CHECK-NEXT: [[D_0:%.*]] = phi i32 [ 0, [[WHILE_COND:%.*]] ], [ [[ADD:%.*]], [[FOR_COND]] ] -; CHECK-NEXT: [[CONV:%.*]] = and i32 [[D_0]], 65535 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], 4 -; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[CONV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[WHILE_COND_LOOPEXIT:%.*]] -; CHECK: while.cond.loopexit: -; CHECK-NEXT: ret i32 0 +; Vectorize with versioning for unit stride for PGSO and enabled vectorization. ; -; AUTOVF-LABEL: @main( -; AUTOVF-NEXT: while.cond: -; AUTOVF-NEXT: br label [[FOR_COND:%.*]] -; AUTOVF: for.cond: -; AUTOVF-NEXT: [[D_0:%.*]] = phi i32 [ 0, [[WHILE_COND:%.*]] ], [ [[ADD:%.*]], [[FOR_COND]] ] -; AUTOVF-NEXT: [[CONV:%.*]] = and i32 [[D_0]], 65535 -; AUTOVF-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], 4 -; AUTOVF-NEXT: [[ADD]] = add nuw nsw i32 [[CONV]], 1 -; AUTOVF-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[WHILE_COND_LOOPEXIT:%.*]] -; AUTOVF: while.cond.loopexit: -; AUTOVF-NEXT: ret i32 0 +define void @stride1_pgso(i16* noalias %B, i32 %BStride) !prof !14 { +; CHECK-PGSO-LABEL: @stride1_pgso( +; CHECK-PGSO-NEXT: entry: +; CHECK-PGSO-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-PGSO: vector.scevcheck: +; CHECK-PGSO-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[BSTRIDE:%.*]], 1 +; CHECK-PGSO-NEXT: [[TMP0:%.*]] = or i1 false, [[IDENT_CHECK]] +; CHECK-PGSO-NEXT: br i1 [[TMP0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-PGSO: vector.ph: +; CHECK-PGSO-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-PGSO: vector.body: +; CHECK-PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-PGSO-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 +; CHECK-PGSO-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 8 +; CHECK-PGSO-NEXT: [[TMP3:%.*]] = mul nsw i32 [[TMP1]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[TMP4:%.*]] = mul nsw i32 [[TMP2]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]] +; CHECK-PGSO-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP4]] +; CHECK-PGSO-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 +; CHECK-PGSO-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <8 x i16>* +; CHECK-PGSO-NEXT: store <8 x i16> , <8 x i16>* [[TMP8]], align 4 +; CHECK-PGSO-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 8 +; CHECK-PGSO-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>* +; CHECK-PGSO-NEXT: store <8 x i16> , <8 x i16>* [[TMP10]], align 4 +; CHECK-PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-PGSO-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; CHECK-PGSO-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-PGSO: middle.block: +; CHECK-PGSO-NEXT: [[CMP_N:%.*]] = icmp eq i32 1025, 1024 +; CHECK-PGSO-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-PGSO: scalar.ph: +; CHECK-PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-PGSO-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-PGSO: for.body: +; CHECK-PGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-PGSO-NEXT: [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]] +; CHECK-PGSO-NEXT: 
[[GEPOFB:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[MULB]] +; CHECK-PGSO-NEXT: store i16 42, i16* [[GEPOFB]], align 4 +; CHECK-PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 +; CHECK-PGSO-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-PGSO: for.end: +; CHECK-PGSO-NEXT: ret void ; -while.cond: - br label %for.cond +; CHECK-NO-PGSO-LABEL: @stride1_pgso( +; CHECK-NO-PGSO-NEXT: entry: +; CHECK-NO-PGSO-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NO-PGSO: vector.scevcheck: +; CHECK-NO-PGSO-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[BSTRIDE:%.*]], 1 +; CHECK-NO-PGSO-NEXT: [[TMP0:%.*]] = or i1 false, [[IDENT_CHECK]] +; CHECK-NO-PGSO-NEXT: br i1 [[TMP0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NO-PGSO: vector.ph: +; CHECK-NO-PGSO-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NO-PGSO: vector.body: +; CHECK-NO-PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NO-PGSO-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 8 +; CHECK-NO-PGSO-NEXT: [[TMP3:%.*]] = mul nsw i32 [[TMP1]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[TMP4:%.*]] = mul nsw i32 [[TMP2]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]] +; CHECK-NO-PGSO-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP4]] +; CHECK-NO-PGSO-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0 +; CHECK-NO-PGSO-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <8 x i16>* +; CHECK-NO-PGSO-NEXT: store <8 x i16> , <8 x i16>* [[TMP8]], align 4 +; CHECK-NO-PGSO-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 8 +; CHECK-NO-PGSO-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>* +; CHECK-NO-PGSO-NEXT: store <8 x i16> , <8 x i16>* [[TMP10]], align 4 +; CHECK-NO-PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-NO-PGSO-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NO-PGSO: middle.block: +; CHECK-NO-PGSO-NEXT: [[CMP_N:%.*]] = icmp eq i32 1025, 1024 +; CHECK-NO-PGSO-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NO-PGSO: scalar.ph: +; CHECK-NO-PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NO-PGSO-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NO-PGSO: for.body: +; CHECK-NO-PGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NO-PGSO-NEXT: [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]] +; CHECK-NO-PGSO-NEXT: [[GEPOFB:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[MULB]] +; CHECK-NO-PGSO-NEXT: store i16 42, i16* [[GEPOFB]], align 4 +; CHECK-NO-PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NO-PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 +; CHECK-NO-PGSO-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NO-PGSO: for.end: +; CHECK-NO-PGSO-NEXT: ret void +; +entry: + br label %for.body -for.cond: - %d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ] - %conv = and i32 %d.0, 65535 - %cmp = icmp ult i32 %conv, 4 - %add = add nuw nsw i32 %conv, 1 - br i1 %cmp, label 
%for.cond, label %while.cond.loopexit +for.body: + %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ] + %mulB = mul nsw i32 %iv, %BStride + %gepOfB = getelementptr inbounds i16, i16* %B, i32 %mulB + store i16 42, i16* %gepOfB, align 4 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, 1025 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !15 -while.cond.loopexit: - ret i32 0 +for.end: + ret void } + +; PR46652: Check that the need for stride==1 check prevents vectorizing a loop +; having tiny trip count, when compiling w/o -Os/-Oz. + +@g = external global [1 x i16], align 1 + +define void @pr46652(i16 %stride) { +; CHECK-LABEL: @pr46652( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[L1_02:%.*]] = phi i16 [ 1, [[ENTRY:%.*]] ], [ [[INC9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i16 [[L1_02]], [[STRIDE:%.*]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [1 x i16], [1 x i16]* @g, i16 0, i16 [[MUL]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX6]], align 1 +; CHECK-NEXT: [[INC9]] = add nuw nsw i16 [[L1_02]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i16 [[INC9]], 16 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %l1.02 = phi i16 [ 1, %entry ], [ %inc9, %for.body ] + %mul = mul nsw i16 %l1.02, %stride + %arrayidx6 = getelementptr inbounds [1 x i16], [1 x i16]* @g, i16 0, i16 %mul + %0 = load i16, i16* %arrayidx6, align 1 + %inc9 = add nuw nsw i16 %l1.02, 1 + %exitcond.not = icmp eq i16 %inc9, 16 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Make sure we do not crash while building the VPlan for the loop with the +; select below. 
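+; (For reference, the select of interest is the smax-like recurrence
+;   %cmp4 = icmp slt i32 %i.014, 99
+;   %cond = select i1 %cmp4, i32 99, i32 %i.014
+; in the loop below; the checks that follow show it being vectorized as an
+; @llvm.vector.reduce.smax.v4i32 reduction with the tail folded by masking.)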
+define i32 @PR48142(i32* %ptr.start, i32* %ptr.end) optsize { +; CHECK-PGSO-LABEL: @PR48142( +; CHECK-PGSO-NEXT: entry: +; CHECK-PGSO-NEXT: [[PTR_START2:%.*]] = ptrtoint i32* [[PTR_START:%.*]] to i32 +; CHECK-PGSO-NEXT: [[PTR_END1:%.*]] = ptrtoint i32* [[PTR_END:%.*]] to i32 +; CHECK-PGSO-NEXT: [[TMP0:%.*]] = add i32 [[PTR_END1]], -4 +; CHECK-PGSO-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[PTR_START2]] +; CHECK-PGSO-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 2 +; CHECK-PGSO-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 +; CHECK-PGSO-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-PGSO: vector.ph: +; CHECK-PGSO-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP3]], 3 +; CHECK-PGSO-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; CHECK-PGSO-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-PGSO-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[N_VEC]] +; CHECK-PGSO-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1 +; CHECK-PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-PGSO-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-PGSO: vector.body: +; CHECK-PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE13:%.*]] ] +; CHECK-PGSO-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[PRED_STORE_CONTINUE13]] ] +; CHECK-PGSO-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0 +; CHECK-PGSO-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-PGSO-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], +; CHECK-PGSO-NEXT: [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-PGSO-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], +; CHECK-PGSO-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> [[VEC_PHI]] +; CHECK-PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 +; CHECK-PGSO-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-PGSO: pred.store.if: +; CHECK-PGSO-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0 +; CHECK-PGSO-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[TMP8]] +; CHECK-PGSO-NEXT: store i32 0, i32* [[NEXT_GEP]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK-PGSO: pred.store.continue: +; CHECK-PGSO-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 +; CHECK-PGSO-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; CHECK-PGSO: pred.store.if8: +; CHECK-PGSO-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 1 +; CHECK-PGSO-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[TMP10]] +; CHECK-PGSO-NEXT: store i32 0, i32* [[NEXT_GEP3]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE9]] +; CHECK-PGSO: pred.store.continue9: +; CHECK-PGSO-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 +; CHECK-PGSO-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; CHECK-PGSO: pred.store.if10: +; CHECK-PGSO-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 2 +; CHECK-PGSO-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[TMP12]] +; CHECK-PGSO-NEXT: store i32 0, i32* [[NEXT_GEP4]], align 
4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE11]] +; CHECK-PGSO: pred.store.continue11: +; CHECK-PGSO-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 +; CHECK-PGSO-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13]] +; CHECK-PGSO: pred.store.if12: +; CHECK-PGSO-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 3 +; CHECK-PGSO-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[TMP14]] +; CHECK-PGSO-NEXT: store i32 0, i32* [[NEXT_GEP5]], align 4 +; CHECK-PGSO-NEXT: br label [[PRED_STORE_CONTINUE13]] +; CHECK-PGSO: pred.store.continue13: +; CHECK-PGSO-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]] +; CHECK-PGSO-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-PGSO-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-PGSO-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-PGSO: middle.block: +; CHECK-PGSO-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP15]]) +; CHECK-PGSO-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-PGSO: scalar.ph: +; CHECK-PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR_START]], [[ENTRY:%.*]] ] +; CHECK-PGSO-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 20, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-PGSO-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-PGSO: for.body: +; CHECK-PGSO-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; CHECK-PGSO-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-PGSO-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I_014]], 99 +; CHECK-PGSO-NEXT: [[COND]] = select i1 [[CMP4]], i32 99, i32 [[I_014]] +; CHECK-PGSO-NEXT: store i32 0, i32* [[PTR_IV]], align 4 +; CHECK-PGSO-NEXT: [[PTR_NEXT]] = getelementptr inbounds i32, i32* [[PTR_IV]], i64 1 +; CHECK-PGSO-NEXT: [[CMP_NOT:%.*]] = icmp eq i32* [[PTR_NEXT]], [[PTR_END]] +; CHECK-PGSO-NEXT: br i1 [[CMP_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-PGSO: exit: +; CHECK-PGSO-NEXT: [[RES:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-PGSO-NEXT: ret i32 [[RES]] +; +; CHECK-NO-PGSO-LABEL: @PR48142( +; CHECK-NO-PGSO-NEXT: entry: +; CHECK-NO-PGSO-NEXT: [[PTR_START2:%.*]] = ptrtoint i32* [[PTR_START:%.*]] to i32 +; CHECK-NO-PGSO-NEXT: [[PTR_END1:%.*]] = ptrtoint i32* [[PTR_END:%.*]] to i32 +; CHECK-NO-PGSO-NEXT: [[TMP0:%.*]] = add i32 [[PTR_END1]], -4 +; CHECK-NO-PGSO-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[PTR_START2]] +; CHECK-NO-PGSO-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 2 +; CHECK-NO-PGSO-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 +; CHECK-NO-PGSO-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NO-PGSO: vector.ph: +; CHECK-NO-PGSO-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP3]], 3 +; CHECK-NO-PGSO-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; CHECK-NO-PGSO-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NO-PGSO-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[N_VEC]] +; CHECK-NO-PGSO-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1 +; CHECK-NO-PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NO-PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> 
zeroinitializer +; CHECK-NO-PGSO-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NO-PGSO: vector.body: +; CHECK-NO-PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE13:%.*]] ] +; CHECK-NO-PGSO-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[PRED_STORE_CONTINUE13]] ] +; CHECK-NO-PGSO-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0 +; CHECK-NO-PGSO-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NO-PGSO-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], +; CHECK-NO-PGSO-NEXT: [[TMP4:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NO-PGSO-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], +; CHECK-NO-PGSO-NEXT: [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> [[VEC_PHI]] +; CHECK-NO-PGSO-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NO-PGSO: pred.store.if: +; CHECK-NO-PGSO-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NO-PGSO-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[TMP8]] +; CHECK-NO-PGSO-NEXT: store i32 0, i32* [[NEXT_GEP]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK-NO-PGSO: pred.store.continue: +; CHECK-NO-PGSO-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; CHECK-NO-PGSO: pred.store.if8: +; CHECK-NO-PGSO-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NO-PGSO-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[TMP10]] +; CHECK-NO-PGSO-NEXT: store i32 0, i32* [[NEXT_GEP3]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE9]] +; CHECK-NO-PGSO: pred.store.continue9: +; CHECK-NO-PGSO-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; CHECK-NO-PGSO: pred.store.if10: +; CHECK-NO-PGSO-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NO-PGSO-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[TMP12]] +; CHECK-NO-PGSO-NEXT: store i32 0, i32* [[NEXT_GEP4]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE11]] +; CHECK-NO-PGSO: pred.store.continue11: +; CHECK-NO-PGSO-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 +; CHECK-NO-PGSO-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13]] +; CHECK-NO-PGSO: pred.store.if12: +; CHECK-NO-PGSO-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NO-PGSO-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[PTR_START]], i32 [[TMP14]] +; CHECK-NO-PGSO-NEXT: store i32 0, i32* [[NEXT_GEP5]], align 4 +; CHECK-NO-PGSO-NEXT: br label [[PRED_STORE_CONTINUE13]] +; CHECK-NO-PGSO: pred.store.continue13: +; CHECK-NO-PGSO-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]] +; CHECK-NO-PGSO-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NO-PGSO-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NO-PGSO-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NO-PGSO: middle.block: +; CHECK-NO-PGSO-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> 
[[TMP15]]) +; CHECK-NO-PGSO-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NO-PGSO: scalar.ph: +; CHECK-NO-PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR_START]], [[ENTRY:%.*]] ] +; CHECK-NO-PGSO-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 20, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-NO-PGSO-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NO-PGSO: for.body: +; CHECK-NO-PGSO-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NO-PGSO-NEXT: [[CMP4:%.*]] = icmp slt i32 [[I_014]], 99 +; CHECK-NO-PGSO-NEXT: [[COND]] = select i1 [[CMP4]], i32 99, i32 [[I_014]] +; CHECK-NO-PGSO-NEXT: store i32 0, i32* [[PTR_IV]], align 4 +; CHECK-NO-PGSO-NEXT: [[PTR_NEXT]] = getelementptr inbounds i32, i32* [[PTR_IV]], i64 1 +; CHECK-NO-PGSO-NEXT: [[CMP_NOT:%.*]] = icmp eq i32* [[PTR_NEXT]], [[PTR_END]] +; CHECK-NO-PGSO-NEXT: br i1 [[CMP_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-NO-PGSO: exit: +; CHECK-NO-PGSO-NEXT: [[RES:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-NO-PGSO-NEXT: ret i32 [[RES]] +; +entry: + br label %for.body + +for.body: + %i.014 = phi i32 [ 20, %entry ], [ %cond, %for.body ] + %ptr.iv = phi i32* [ %ptr.start, %entry ], [ %ptr.next, %for.body ] + %cmp4 = icmp slt i32 %i.014, 99 + %cond = select i1 %cmp4, i32 99, i32 %i.014 + store i32 0, i32* %ptr.iv + %ptr.next = getelementptr inbounds i32, i32* %ptr.iv, i64 1 + %cmp.not = icmp eq i32* %ptr.next, %ptr.end + br i1 %cmp.not, label %exit, label %for.body + +exit: + %res = phi i32 [ %cond, %for.body ] + ret i32 %res +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} +!15 = distinct !{!15, !16} +!16 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll --- a/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll @@ -1,39 +1,501 @@ -; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -mcpu=prescott < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; This test verifies that the loop vectorizer will not vectorizes low trip count +; loops that require runtime checks (Trip count is computed with profile info). 
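+; (Rough sketch of the estimate, assuming the usual reading of the !prof
+; branch_weights on the loop latches below, where the first weight belongs to
+; the exit edge and the second to the backedge:
+;   estimated trip count ~= backedge weight / exit weight + 1
+; so !1 = !{"branch_weights", 100, 0} suggests ~0/100 + 1 = 1 iteration per
+; invocation, while !3 = !{"branch_weights", 10, 10000} suggests
+; ~10000/10 + 1 = 1001 iterations, matching the note at the end of this file.)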
+; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -loop-vectorize-with-block-frequency -S | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" -target triple = "i386-unknown-freebsd11.0" +target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" +target triple = "x86_64-unknown-linux-gnu" -@big = external global [0 x i32] +@tab = common global [32 x i8] zeroinitializer, align 1 -; PR18049 -; We need to truncate the exit count to i32. This is legal because the -; arithmetic is signed (%inc is nsw). - -; CHECK-LABEL: tripcount -; CHECK: trunc i64 %count to i32 - -define void @tripcount(i64 %count) { +define i32 @foo_low_trip_count1(i32 %bound) { +; Simple loop with low tripcount. Should not be vectorized. +; CHECK-LABEL: @foo_low_trip_count1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; entry: - %cmp6 = icmp sgt i64 %count, 0 - br i1 %cmp6, label %for.body.preheader, label %for.end - -for.body.preheader: br label %for.body -for.body: - %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @big, i32 0, i32 %i.07 - %0 = load i32, i32* %arrayidx, align 4 - %neg = xor i32 %0, -1 - store i32 %neg, i32* %arrayidx, align 4 - %inc = add nsw i32 %i.07, 1 - %conv = sext i32 %inc to i64 - %cmp = icmp slt i64 %conv, %count - br i1 %cmp, label %for.body, label %for.end.loopexit +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, %bound + br i1 %exitcond, label %for.end, label %for.body, !prof !1 -for.end.loopexit: - br label %for.end - -for.end: - ret void +for.end: ; preds = %for.body + ret i32 0 } + +define i32 @foo_low_trip_count2(i32 %bound) !prof !0 { +; The loop has a same invocation count with the function, but has a low +; trip_count per invocation and not worth to vectorize. 
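+; (Concretely, !0 at the end of the file gives this function an entry count of
+; 100, and the loop header is reached unconditionally on every invocation, so
+; the loop is also invoked ~100 times; but with latch weights !1 = {100, 0}
+; each invocation is expected to execute only about one iteration.)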
+; CHECK-LABEL: @foo_low_trip_count2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, %bound + br i1 %exitcond, label %for.end, label %for.body, !prof !1 + +for.end: ; preds = %for.body + ret i32 0 +} + +define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 { +; The loop has low invocation count compare to the function invocation count, +; but has a high trip count per invocation. Vectorize it. +; CHECK-LABEL: @foo_low_trip_count3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]], !prof [[PROF2:![0-9]+]] +; CHECK: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BOUND:%.*]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 32 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 16 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD2]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> , <16 x i8> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> , <16 x i8> 
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP7]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* [[TMP14]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF3:![0-9]+]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[BOUND]], 1 +; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i32 [[TMP16]], 8 +; CHECK-NEXT: [[N_VEC4:%.*]] = sub i32 [[TMP16]], [[N_MOD_VF3]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX5:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[INDEX5]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP20]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD8]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i8> , <8 x i8> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP19]] to <8 x i8>* +; CHECK-NEXT: store <8 x i8> [[TMP22]], <8 x i8>* [[TMP23]], align 1 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX5]], 8 +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP24]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i32 [[TMP16]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP25]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq 
i32 [[I_08]], [[BOUND]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], label [[FOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end.loopexit.loopexit: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br i1 %cond, label %for.preheader, label %for.end, !prof !2 + +for.preheader: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, %bound + br i1 %exitcond, label %for.end, label %for.body, !prof !3 + +for.end: ; preds = %for.body + ret i32 0 +} + +define i32 @foo_low_trip_count_icmp_sgt(i32 %bound) { +; Simple loop with low tripcount and inequality test for exit. +; Should not be vectorized. +; CHECK-LABEL: @foo_low_trip_count_icmp_sgt( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sgt i32 [[I_08]], [[BOUND:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp sgt i32 %i.08, %bound + br i1 %exitcond, label %for.end, label %for.body, !prof !1 + +for.end: ; preds = %for.body + ret i32 0 +} + +define i32 @const_low_trip_count() { +; Simple loop with constant, small trip count and no profiling info. 
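+; (The latch below compares `icmp slt i32 %i.08, 2` with a unit step, so the
+; body runs for i = 0, 1 and 2, i.e. a constant trip count of 3, presumably too
+; small for vectorization to pay off; the checks expect only the scalar loop.)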
+; CHECK-LABEL: @const_low_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 2 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp slt i32 %i.08, 2 + br i1 %exitcond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret i32 0 +} + +define i32 @const_large_trip_count() { +; Simple loop with constant large trip count and no profiling info. +; CHECK-LABEL: @const_large_trip_count( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 16 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD1]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP8]], <16 x i8> , <16 x i8> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> , <16 x i8> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* [[TMP12]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[TMP13]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1001, 992 +; 
CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[INDEX2]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP18]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD5]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP19]], <8 x i8> , <8 x i8> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>* +; CHECK-NEXT: store <8 x i8> [[TMP20]], <8 x i8>* [[TMP21]], align 1 +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i32 [[INDEX2]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT3]], 1000 +; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i32 1001, 1000 +; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP23:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP23]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp slt i32 %i.08, 1000 + br i1 %exitcond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret i32 0 +} + +define i32 @const_small_trip_count_step() { +; Simple loop with static, small trip count and no profiling info. 
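+; (Here the induction steps by 5 and the latch checks `icmp slt i32 %i.08, 10`,
+; so the body runs for i = 0, 5 and 10: again just 3 iterations, and again the
+; checks expect no vector loop.)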
+; CHECK-LABEL: @const_small_trip_count_step( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 5 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 10 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 5 + %exitcond = icmp slt i32 %i.08, 10 + br i1 %exitcond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret i32 0 +} + +define i32 @const_trip_over_profile() { +; constant trip count takes precedence over profile data +; CHECK-LABEL: @const_trip_over_profile( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 16 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD1]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP8]], <16 x i8> , <16 x i8> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> , <16 x i8> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* [[TMP12]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[TMP13]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1001, 992 +; 
CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[INDEX2]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP18]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD5]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP19]], <8 x i8> , <8 x i8> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP17]] to <8 x i8>* +; CHECK-NEXT: store <8 x i8> [[TMP20]], <8 x i8>* [[TMP21]], align 1 +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i32 [[INDEX2]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT3]], 1000 +; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i32 1001, 1000 +; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP23:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP23]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !prof [[PROF0]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp slt i32 %i.08, 1000 + br i1 %exitcond, label %for.body, label %for.end, !prof !1 + +for.end: ; preds = %for.body + ret i32 0 +} + +; original loop has latchExitWeight=10 and backedgeTakenWeight=10,000, +; therefore estimatedBackedgeTakenCount=1,000 and estimatedTripCount=1,001. 
+; Vectorizing by 4 produces estimatedTripCounts of 1,001/4=250 and 1,001%4=1 +; for vectorized and remainder loops, respectively, therefore their +; estimatedBackedgeTakenCounts are 249 and 0, and so the weights recorded with +; loop invocation weights of 10 are the above {10, 2490} and {10, 0}. + +!0 = !{!"function_entry_count", i64 100} +!1 = !{!"branch_weights", i32 100, i32 0} +!2 = !{!"branch_weights", i32 10, i32 90} +!3 = !{!"branch_weights", i32 10, i32 10000} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-sink-scalars-and-merge.ll rename from llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll rename to llvm/test/Transforms/LoopVectorize/X86/vplan-sink-scalars-and-merge.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-sink-scalars-and-merge.ll @@ -3,6 +3,7 @@ ; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -debug -disable-output %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" @a = common global [2048 x i32] zeroinitializer, align 16 @b = common global [2048 x i32] zeroinitializer, align 16 @@ -11,40 +12,43 @@ ; CHECK-LABEL: LV: Checking a loop in "sink1" ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count +; CHECK-NEXT: Live-in vp<%0> = backedge-taken count ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { -; CHECK-NEXT: loop: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> -; CHECK-NEXT: Successor(s): loop.0 - -; CHECK: loop.0: -; CHECK-NEXT: Successor(s): pred.store - -; CHECK: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-NEXT: CondBit: vp<[[MASK]]> (loop) - -; CHECK: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%lv.b>, ir<10> -; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<2>, ir<%add> -; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE store ir<%mul>, ir<%gep.a> -; CHECK-NEXT: Successor(s): pred.store.continue - -; CHECK: pred.store.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv.b> +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: EMIT vp<%2> = icmp ule ir<%iv> vp<%0> +; CHECK-NEXT: Successor(s): loop.0 +; CHECK-EMPTY: +; CHECK-NEXT: loop.0: +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%2> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: CondBit: vp<%2> (loop) +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> +; CHECK-NEXT: REPLICATE ir<%add> = add ir<%lv.b>, ir<10> +; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<2>, ir<%add> +; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE store ir<%mul>, ir<%gep.a> +; 
CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%lv.b> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.1 +; CHECK-EMPTY: +; CHECK-NEXT: loop.1: +; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> +; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } - -; CHECK: loop.1: -; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> -; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -72,53 +76,43 @@ ; CHECK-LABEL: LV: Checking a loop in "sink2" ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count +; CHECK-NEXT: Live-in vp<%0> = backedge-taken count ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { -; CHECK-NEXT: loop: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> -; CHECK-NEXT: Successor(s): pred.load - -; CHECK: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<[[MASK]]> (loop) - -; CHECK: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> -; CHECK-NEXT: Successor(s): pred.load.continue - -; CHECK: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv.b> +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: EMIT vp<%2> = icmp ule ir<%iv> vp<%0> +; CHECK-NEXT: Successor(s): loop.0 +; CHECK-EMPTY: +; CHECK-NEXT: loop.0: +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%2> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: CondBit: vp<%2> (loop) +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> +; CHECK-NEXT: REPLICATE ir<%add> = add ir<%lv.b>, ir<10> +; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<%iv>, ir<2> +; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> +; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.a> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%lv.b> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.1 +; CHECK-EMPTY: +; CHECK-NEXT: loop.1: +; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> +; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } - -; CHECK: loop.0: -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<2> -; CHECK-NEXT: Successor(s): pred.store - -; CHECK: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-NEXT: CondBit: vp<[[MASK]]> (loop) - -; CHECK: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%add> = add vp<[[PRED]]>, ir<10> -; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> -; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.a> -; CHECK-NEXT: Successor(s): pred.store.continue - -; CHECK: pred.store.continue: -; CHECK-NEXT: No successors -; 
CHECK-NEXT: } - -; CHECK: loop.1: -; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> -; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -146,53 +140,43 @@ ; CHECK-LABEL: LV: Checking a loop in "sink3" ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count +; CHECK-NEXT: Live-in vp<%0> = backedge-taken count ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { -; CHECK-NEXT: loop: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> -; CHECK-NEXT: Successor(s): pred.load - -; CHECK: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<[[MASK]]> (loop) - -; CHECK: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> (S->V) -; CHECK-NEXT: Successor(s): pred.load.continue - -; CHECK: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv.b> +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: EMIT vp<%2> = icmp ule ir<%iv> vp<%0> +; CHECK-NEXT: Successor(s): loop.0 +; CHECK-EMPTY: +; CHECK-NEXT: loop.0: +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%2> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: CondBit: vp<%2> (loop) +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> +; CHECK-NEXT: REPLICATE ir<%add> = add ir<%lv.b>, ir<10> +; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<%iv>, ir<%add> +; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> +; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.a> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%lv.b> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.1 +; CHECK-EMPTY: +; CHECK-NEXT: loop.1: +; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> +; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } - -; CHECK: loop.0: -; CHECK-NEXT: WIDEN ir<%add> = add vp<[[PRED]]>, ir<10> -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<%add> -; CHECK-NEXT: Successor(s): pred.store - -; CHECK: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-NEXT: CondBit: vp<[[MASK]]> (loop) - -; CHECK: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> -; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.a> -; CHECK-NEXT: Successor(s): pred.store.continue - -; CHECK: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } - -; CHECK: loop.1: -; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> -; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -219,67 +203,70 @@ } ; Make sure we do not sink uniform instructions. 
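+; (Here the uniform instruction is the loop-invariant address printed as
+;   CLONE ir<%gep.A.uniform> = getelementptr ir<%A>, ir<0>
+; in the checks below; it stays in the main loop region rather than being sunk
+; into the predicated pred.load block that uses it.)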
-define void @uniform_gep(i64 %k, i16* noalias %A, i16* noalias %B) { +; ; CHECK-LABEL: LV: Checking a loop in "uniform_gep" -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = backedge-taken count ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { -; CHECK-NEXT: loop: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 21, %iv.next -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[CAN_IV]]> vp<[[BTC]]> -; CHECK-NEXT: CLONE ir<%gep.A.uniform> = getelementptr ir<%A>, ir<0> -; CHECK-NEXT: Successor(s): pred.load +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 21, %iv.next +; CHECK-NEXT: EMIT vp<%2> = WIDEN-CANONICAL-INDUCTION +; CHECK-NEXT: EMIT vp<%3> = icmp ule vp<%2> vp<%0> +; CHECK-NEXT: CLONE ir<%gep.A.uniform> = getelementptr ir<%A>, ir<0> +; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: -; CHECK-NEXT: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<[[MASK]]> (loop) +; CHECK-NEXT: pred.load: { +; CHECK-NEXT: pred.load.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%3> +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue +; CHECK-NEXT: CondBit: vp<%3> (loop) ; CHECK-EMPTY: -; CHECK-NEXT: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%lv> = load ir<%gep.A.uniform> -; CHECK-NEXT: Successor(s): pred.load.continue +; CHECK-NEXT: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%lv> = load ir<%gep.A.uniform> +; CHECK-NEXT: Successor(s): pred.load.continue ; CHECK-EMPTY: -; CHECK-NEXT: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv> +; CHECK-NEXT: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%6> = ir<%lv> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.0 +; CHECK-EMPTY: +; CHECK-NEXT: loop.0: +; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%iv>, ir<%k> +; CHECK-NEXT: Successor(s): loop.then +; CHECK-EMPTY: +; CHECK-NEXT: loop.then: +; CHECK-NEXT: EMIT vp<%8> = not ir<%cmp> +; CHECK-NEXT: EMIT vp<%9> = select vp<%3> vp<%8> ir +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%9> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: CondBit: vp<%9> (loop.then) +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep.B> = getelementptr ir<%B>, ir<%iv> +; CHECK-NEXT: REPLICATE store vp<%6>, ir<%gep.B> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.then.0 +; CHECK-EMPTY: +; CHECK-NEXT: loop.then.0: +; CHECK-NEXT: Successor(s): loop.latch +; CHECK-EMPTY: +; CHECK-NEXT: loop.latch: ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.0 -; CHECK-EMPTY: -; CHECK-NEXT: loop.0: -; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%iv>, ir<%k> -; CHECK-NEXT: Successor(s): loop.then -; CHECK-EMPTY: -; CHECK-NEXT: loop.then: -; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not ir<%cmp> -; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK]]> vp<[[NOT2]]> ir -; CHECK-NEXT: Successor(s): pred.store -; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK 
vp<[[MASK2]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-NEXT: CondBit: vp<[[MASK2]]> (loop.then) -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%gep.B> = getelementptr ir<%B>, ir<%iv> -; CHECK-NEXT: REPLICATE store vp<[[PRED]]>, ir<%gep.B> -; CHECK-NEXT: Successor(s): pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.then.0 -; CHECK-EMPTY: -; CHECK-NEXT: loop.then.0: -; CHECK-NEXT: Successor(s): loop.latch -; CHECK-EMPTY: -; CHECK-NEXT: loop.latch: ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; +define void @uniform_gep(i64 %k, i16* noalias %A, i16* noalias %B) { entry: br label %loop @@ -304,72 +291,75 @@ } ; Loop with predicated load. -define void @pred_cfg1(i32 %k, i32 %j) { +; ; CHECK-LABEL: LV: Checking a loop in "pred_cfg1" -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = backedge-taken count ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { -; CHECK-NEXT: loop: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next -; CHECK-NEXT: WIDEN ir<%c.1> = icmp ir<%iv>, ir<%j> -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<10> -; CHECK-NEXT: Successor(s): then.0 +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: WIDEN ir<%c.1> = icmp ir<%iv>, ir<%j> +; CHECK-NEXT: Successor(s): then.0 ; CHECK-EMPTY: -; CHECK-NEXT: then.0: -; CHECK-NEXT: EMIT vp<[[MASK1:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> -; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK1]]> ir<%c.1> ir -; CHECK-NEXT: Successor(s): pred.load +; CHECK-NEXT: then.0: +; CHECK-NEXT: EMIT vp<%3> = icmp ule ir<%iv> vp<%0> +; CHECK-NEXT: EMIT vp<%4> = select vp<%3> ir<%c.1> ir +; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: -; CHECK-NEXT: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK2]]> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<[[MASK2]]> (then.0) +; CHECK-NEXT: pred.load: { +; CHECK-NEXT: pred.load.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%4> +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue +; CHECK-NEXT: CondBit: vp<%4> (then.0) ; CHECK-EMPTY: -; CHECK-NEXT: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> (S->V) -; CHECK-NEXT: Successor(s): pred.load.continue +; CHECK-NEXT: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue ; CHECK-EMPTY: -; CHECK-NEXT: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv.b> +; CHECK-NEXT: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%lv.b> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): then.0.0 +; CHECK-EMPTY: +; CHECK-NEXT: then.0.0: +; CHECK-NEXT: Successor(s): next.0 +; CHECK-EMPTY: +; CHECK-NEXT: next.0: +; CHECK-NEXT: EMIT vp<%8> = not ir<%c.1> +; CHECK-NEXT: EMIT vp<%9> = select vp<%3> vp<%8> ir +; CHECK-NEXT: BLEND %p = ir<0>/vp<%9> vp<%7>/vp<%4> +; CHECK-NEXT: EMIT vp<%11> = or vp<%4> vp<%9> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK 
vp<%11> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: CondBit: vp<%11> (next.0) +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<%iv>, ir<10> +; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> +; CHECK-NEXT: REPLICATE store ir<%p>, ir<%gep.a> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): next.0.0 +; CHECK-EMPTY: +; CHECK-NEXT: next.0.0: +; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> +; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): then.0.0 -; CHECK-EMPTY: -; CHECK-NEXT: then.0.0: -; CHECK-NEXT: Successor(s): next.0 -; CHECK-EMPTY: -; CHECK-NEXT: next.0: -; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.1> -; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = select vp<[[MASK1]]> vp<[[NOT]]> ir -; CHECK-NEXT: BLEND %p = ir<0>/vp<[[MASK3]]> vp<[[PRED]]>/vp<[[MASK2]]> -; CHECK-NEXT: EMIT vp<[[OR:%.+]]> = or vp<[[MASK2]]> vp<[[MASK3]]> -; CHECK-NEXT: Successor(s): pred.store -; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[OR]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-NEXT: CondBit: vp<[[OR]]> (next.0) -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> -; CHECK-NEXT: REPLICATE store ir<%p>, ir<%gep.a> -; CHECK-NEXT: Successor(s): pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): next.0.0 -; CHECK-EMPTY: -; CHECK-NEXT: next.0.0: -; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> -; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; + +define void @pred_cfg1(i32 %k, i32 %j) { entry: br label %loop @@ -400,80 +390,82 @@ ; Loop with predicated load and store in separate blocks, store depends on ; loaded value. 
-define void @pred_cfg2(i32 %k, i32 %j) { +; ; CHECK-LABEL: LV: Checking a loop in "pred_cfg2" -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = backedge-taken count ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { -; CHECK-NEXT: loop: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<10> -; CHECK-NEXT: WIDEN ir<%c.0> = icmp ir<%iv>, ir<%j> -; CHECK-NEXT: WIDEN ir<%c.1> = icmp ir<%iv>, ir<%j> -; CHECK-NEXT: Successor(s): then.0 +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: WIDEN ir<%c.0> = icmp ir<%iv>, ir<%j> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp ir<%iv>, ir<%j> +; CHECK-NEXT: Successor(s): then.0 ; CHECK-EMPTY: -; CHECK-NEXT: then.0: -; CHECK-NEXT: EMIT vp<[[MASK1:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> -; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK1]]> ir<%c.0> ir -; CHECK-NEXT: Successor(s): pred.load +; CHECK-NEXT: then.0: +; CHECK-NEXT: EMIT vp<%4> = icmp ule ir<%iv> vp<%0> +; CHECK-NEXT: EMIT vp<%5> = select vp<%4> ir<%c.0> ir +; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: -; CHECK-NEXT: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK2]]> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<[[MASK2]]> (then.0) +; CHECK-NEXT: pred.load: { +; CHECK-NEXT: pred.load.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%5> +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue +; CHECK-NEXT: CondBit: vp<%5> (then.0) ; CHECK-EMPTY: -; CHECK-NEXT: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> (S->V) -; CHECK-NEXT: Successor(s): pred.load.continue +; CHECK-NEXT: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue ; CHECK-EMPTY: -; CHECK-NEXT: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv.b> +; CHECK-NEXT: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%8> = ir<%lv.b> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): then.0.0 +; CHECK-EMPTY: +; CHECK-NEXT: then.0.0: +; CHECK-NEXT: Successor(s): next.0 +; CHECK-EMPTY: +; CHECK-NEXT: next.0: +; CHECK-NEXT: EMIT vp<%9> = not ir<%c.0> +; CHECK-NEXT: EMIT vp<%10> = select vp<%4> vp<%9> ir +; CHECK-NEXT: BLEND %p = ir<0>/vp<%10> vp<%8>/vp<%5> +; CHECK-NEXT: Successor(s): then.1 +; CHECK-EMPTY: +; CHECK-NEXT: then.1: +; CHECK-NEXT: EMIT vp<%12> = or vp<%5> vp<%10> +; CHECK-NEXT: EMIT vp<%13> = select vp<%12> ir<%c.1> ir +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%13> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: CondBit: vp<%13> (then.1) +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<%iv>, ir<10> +; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> +; CHECK-NEXT: REPLICATE store ir<%p>, ir<%gep.a> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): then.1.0 +; CHECK-EMPTY: +; CHECK-NEXT: 
then.1.0: +; CHECK-NEXT: Successor(s): next.1 +; CHECK-EMPTY: +; CHECK-NEXT: next.1: +; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> +; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): then.0.0 -; CHECK-EMPTY: -; CHECK-NEXT: then.0.0: -; CHECK-NEXT: Successor(s): next.0 -; CHECK-EMPTY: -; CHECK-NEXT: next.0: -; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.0> -; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = select vp<[[MASK1]]> vp<[[NOT]]> ir -; CHECK-NEXT: BLEND %p = ir<0>/vp<[[MASK3]]> vp<[[PRED]]>/vp<[[MASK2]]> -; CHECK-NEXT: Successor(s): then.1 -; CHECK-EMPTY: -; CHECK-NEXT: then.1: -; CHECK-NEXT: EMIT vp<[[OR:%.+]]> = or vp<[[MASK2]]> vp<[[MASK3]]> -; CHECK-NEXT: EMIT vp<[[MASK4:%.+]]> = select vp<[[OR]]> ir<%c.1> ir -; CHECK-NEXT: Successor(s): pred.store -; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK4]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-NEXT: CondBit: vp<[[MASK4]]> (then.1) -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> -; CHECK-NEXT: REPLICATE store ir<%p>, ir<%gep.a> -; CHECK-NEXT: Successor(s): pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): then.1.0 -; CHECK-EMPTY: -; CHECK-NEXT: then.1.0: -; CHECK-NEXT: Successor(s): next.1 -; CHECK-EMPTY: -; CHECK-NEXT: next.1: -; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> -; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; +define void @pred_cfg2(i32 %k, i32 %j) { entry: br label %loop @@ -511,78 +503,80 @@ ; Loop with predicated load and store in separate blocks, store does not depend ; on loaded value. 
-define void @pred_cfg3(i32 %k, i32 %j) { +; ; CHECK-LABEL: LV: Checking a loop in "pred_cfg3" -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = backedge-taken count ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { -; CHECK-NEXT: loop: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next -; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%iv>, ir<10> -; CHECK-NEXT: WIDEN ir<%c.0> = icmp ir<%iv>, ir<%j> -; CHECK-NEXT: Successor(s): then.0 +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: WIDEN ir<%c.0> = icmp ir<%iv>, ir<%j> +; CHECK-NEXT: Successor(s): then.0 ; CHECK-EMPTY: -; CHECK-NEXT: then.0: -; CHECK-NEXT: EMIT vp<[[MASK1:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> -; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK1:%.+]]> ir<%c.0> ir -; CHECK-NEXT: Successor(s): pred.load +; CHECK-NEXT: then.0: +; CHECK-NEXT: EMIT vp<%3> = icmp ule ir<%iv> vp<%0> +; CHECK-NEXT: EMIT vp<%4> = select vp<%3> ir<%c.0> ir +; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: -; CHECK-NEXT: pred.load: { -; CHECK-NEXT: pred.load.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK2]]> -; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue -; CHECK-NEXT: CondBit: vp<[[MASK2]]> (then.0) +; CHECK-NEXT: pred.load: { +; CHECK-NEXT: pred.load.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%4> +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue +; CHECK-NEXT: CondBit: vp<%4> (then.0) ; CHECK-EMPTY: -; CHECK-NEXT: pred.load.if: -; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> -; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> -; CHECK-NEXT: Successor(s): pred.load.continue +; CHECK-NEXT: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%iv> +; CHECK-NEXT: REPLICATE ir<%lv.b> = load ir<%gep.b> +; CHECK-NEXT: Successor(s): pred.load.continue ; CHECK-EMPTY: -; CHECK-NEXT: pred.load.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<[[PRED:%.+]]> = ir<%lv.b> +; CHECK-NEXT: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%7> = ir<%lv.b> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): then.0.0 +; CHECK-EMPTY: +; CHECK-NEXT: then.0.0: +; CHECK-NEXT: Successor(s): next.0 +; CHECK-EMPTY: +; CHECK-NEXT: next.0: +; CHECK-NEXT: Successor(s): then.1 +; CHECK-EMPTY: +; CHECK-NEXT: then.1: +; CHECK-NEXT: EMIT vp<%8> = not ir<%c.0> +; CHECK-NEXT: EMIT vp<%9> = select vp<%3> vp<%8> ir +; CHECK-NEXT: EMIT vp<%10> = or vp<%4> vp<%9> +; CHECK-NEXT: EMIT vp<%11> = select vp<%10> ir<%c.0> ir +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<%11> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: CondBit: vp<%11> (then.1) +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%mul> = mul ir<%iv>, ir<10> +; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> +; CHECK-NEXT: REPLICATE store ir<0>, ir<%gep.a> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): then.1.0 +; CHECK-EMPTY: +; CHECK-NEXT: then.1.0: +; CHECK-NEXT: Successor(s): next.1 +; CHECK-EMPTY: +; CHECK-NEXT: next.1: +; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> +; CHECK-NEXT: CLONE ir<%exitcond> = icmp 
ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): then.0.0 -; CHECK-EMPTY: -; CHECK-NEXT: then.0.0: -; CHECK-NEXT: Successor(s): next.0 -; CHECK-EMPTY: -; CHECK-NEXT: next.0: -; CHECK-NEXT: Successor(s): then.1 -; CHECK-EMPTY: -; CHECK-NEXT: then.1: -; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%c.0> -; CHECK-NEXT: EMIT vp<[[MASK3:%.+]]> = select vp<[[MASK1]]> vp<[[NOT]]> ir -; CHECK-NEXT: EMIT vp<[[MASK4:%.+]]> = or vp<[[MASK2]]> vp<[[MASK3]]> -; CHECK-NEXT: EMIT vp<[[MASK5:%.+]]> = select vp<[[MASK4]]> ir<%c.0> ir -; CHECK-NEXT: Successor(s): pred.store -; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK5]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-NEXT: CondBit: vp<[[MASK5]]> (then.1) -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul> -; CHECK-NEXT: REPLICATE store ir<0>, ir<%gep.a> -; CHECK-NEXT: Successor(s): pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): then.1.0 -; CHECK-EMPTY: -; CHECK-NEXT: then.1.0: -; CHECK-NEXT: Successor(s): next.1 -; CHECK-EMPTY: -; CHECK-NEXT: next.1: -; CHECK-NEXT: CLONE ir<%large> = icmp ir<%iv>, ir<8> -; CHECK-NEXT: CLONE ir<%exitcond> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; +define void @pred_cfg3(i32 %k, i32 %j) { entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -1103,8 +1103,8 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer ; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; DISABLED_MASKED_STRIDED: vector.body: -; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE60:%.*]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE60]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE44:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE44]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0 @@ -1277,148 +1277,115 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[TMP103:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP102]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP104:%.*]] = extractelement <8 x i8> [[TMP100]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP104]], i8* [[TMP103]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP105:%.*]] = extractelement <8 x i8> [[TMP100]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = sub i8 0, [[TMP105]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = extractelement <8 x i32> [[TMP50]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: 
[[TMP108:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP107]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP106]], i8* [[TMP108]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] ; DISABLED_MASKED_STRIDED: pred.store.continue: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP105:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP105]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP109]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if31: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = extractelement <8 x i32> [[TMP1]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP106]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP108]], i8* [[TMP107]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i32> [[TMP1]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP112]], i8* [[TMP111]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = sub i8 0, [[TMP113]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i32> [[TMP50]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP114]], i8* [[TMP116]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE32]] ; DISABLED_MASKED_STRIDED: pred.store.continue32: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP109]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP117]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if33: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i32> [[TMP1]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP112]], i8* [[TMP111]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i32> [[TMP1]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP120]], i8* [[TMP119]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = sub i8 0, [[TMP121]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i32> [[TMP50]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]] +; 
DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP122]], i8* [[TMP124]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE34]] ; DISABLED_MASKED_STRIDED: pred.store.continue34: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP113]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP125]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if35: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i32> [[TMP1]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP114]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP116]], i8* [[TMP115]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i32> [[TMP1]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP128]], i8* [[TMP127]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = sub i8 0, [[TMP129]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i32> [[TMP50]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP130]], i8* [[TMP132]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE36]] ; DISABLED_MASKED_STRIDED: pred.store.continue36: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP117]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP133]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if37: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i32> [[TMP1]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP120]], i8* [[TMP119]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i32> [[TMP1]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP134]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP136]], i8* [[TMP135]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = sub i8 0, [[TMP137]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP138]], i8* [[TMP140]], align 
1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE38]] ; DISABLED_MASKED_STRIDED: pred.store.continue38: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP121]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP141]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if39: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i32> [[TMP1]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP122]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP124]], i8* [[TMP123]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i32> [[TMP1]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP142]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP144]], i8* [[TMP143]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = sub i8 0, [[TMP145]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP146]], i8* [[TMP148]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE40]] ; DISABLED_MASKED_STRIDED: pred.store.continue40: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP125]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP149]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if41: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i32> [[TMP1]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP128]], i8* [[TMP127]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i32> [[TMP1]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP150]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP152]], i8* [[TMP151]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = sub i8 0, [[TMP153]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP154]], i8* [[TMP156]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE42]] ; 
DISABLED_MASKED_STRIDED: pred.store.continue42: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP129]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP157]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44]] ; DISABLED_MASKED_STRIDED: pred.store.if43: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP130]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP132]], i8* [[TMP131]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE44]] -; DISABLED_MASKED_STRIDED: pred.store.continue44: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = sub <8 x i8> zeroinitializer, [[TMP100]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP134]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if45: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = extractelement <8 x i32> [[TMP50]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i8> [[TMP133]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP137]], i8* [[TMP136]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE46]] -; DISABLED_MASKED_STRIDED: pred.store.continue46: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP138]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if47: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i8> [[TMP133]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP141]], i8* [[TMP140]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE48]] -; DISABLED_MASKED_STRIDED: pred.store.continue48: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP142]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if49: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i32> [[TMP50]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP143]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = extractelement <8 x i8> [[TMP133]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP145]], i8* [[TMP144]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE50]] -; DISABLED_MASKED_STRIDED: pred.store.continue50: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP146]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if51: -; 
DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i8> [[TMP133]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP149]], i8* [[TMP148]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE52]] -; DISABLED_MASKED_STRIDED: pred.store.continue52: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP150]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if53: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i32> [[TMP50]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP151]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = extractelement <8 x i8> [[TMP133]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP153]], i8* [[TMP152]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE54]] -; DISABLED_MASKED_STRIDED: pred.store.continue54: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP154]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if55: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i8> [[TMP133]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP157]], i8* [[TMP156]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE56]] -; DISABLED_MASKED_STRIDED: pred.store.continue56: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP158]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if57: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i32> [[TMP50]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP159]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = extractelement <8 x i8> [[TMP133]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP161]], i8* [[TMP160]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE58]] -; DISABLED_MASKED_STRIDED: pred.store.continue58: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP162]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60]] -; DISABLED_MASKED_STRIDED: pred.store.if59: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP158]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP160]], i8* [[TMP159]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = sub i8 0, [[TMP161]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i32> [[TMP50]], 
i64 7 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = extractelement <8 x i8> [[TMP133]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP165]], i8* [[TMP164]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] -; DISABLED_MASKED_STRIDED: pred.store.continue60: +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP162]], i8* [[TMP164]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE44]] +; DISABLED_MASKED_STRIDED: pred.store.continue44: ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP166]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP165]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void ; @@ -1523,8 +1490,8 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer ; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; DISABLED_MASKED_STRIDED: vector.body: -; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE65:%.*]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE65]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE49:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE49]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], ; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 @@ -1697,148 +1664,115 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP107]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i8> [[TMP105]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP109]], i8* [[TMP108]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i8> [[TMP105]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = sub i8 0, [[TMP110]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i32> [[TMP55]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP112]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP111]], i8* [[TMP113]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] ; DISABLED_MASKED_STRIDED: pred.store.continue: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP110]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP114]], label [[PRED_STORE_IF36:%.*]], 
label [[PRED_STORE_CONTINUE37:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if36: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP111]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP113]], i8* [[TMP112]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP117]], i8* [[TMP116]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = sub i8 0, [[TMP118]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i32> [[TMP55]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP120]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP119]], i8* [[TMP121]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE37]] ; DISABLED_MASKED_STRIDED: pred.store.continue37: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP114]], label [[PRED_STORE_IF38:%.*]], label [[PRED_STORE_CONTINUE39:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP122]], label [[PRED_STORE_IF38:%.*]], label [[PRED_STORE_CONTINUE39:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if38: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP117]], i8* [[TMP116]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP125]], i8* [[TMP124]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = sub i8 0, [[TMP126]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i32> [[TMP55]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP128]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP127]], i8* [[TMP129]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE39]] ; DISABLED_MASKED_STRIDED: pred.store.continue39: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP118]], label [[PRED_STORE_IF40:%.*]], label [[PRED_STORE_CONTINUE41:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP130]], label [[PRED_STORE_IF40:%.*]], label [[PRED_STORE_CONTINUE41:%.*]] ; DISABLED_MASKED_STRIDED: 
pred.store.if40: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP119]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP121]], i8* [[TMP120]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP133]], i8* [[TMP132]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = sub i8 0, [[TMP134]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = extractelement <8 x i32> [[TMP55]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP136]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP135]], i8* [[TMP137]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE41]] ; DISABLED_MASKED_STRIDED: pred.store.continue41: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP122]], label [[PRED_STORE_IF42:%.*]], label [[PRED_STORE_CONTINUE43:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP138]], label [[PRED_STORE_IF42:%.*]], label [[PRED_STORE_CONTINUE43:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if42: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP125]], i8* [[TMP124]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP141]], i8* [[TMP140]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = sub i8 0, [[TMP142]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = extractelement <8 x i32> [[TMP55]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP143]], i8* [[TMP145]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE43]] ; DISABLED_MASKED_STRIDED: pred.store.continue43: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP126]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP146]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if44: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = 
extractelement <8 x i32> [[TMP6]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP127]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP129]], i8* [[TMP128]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP149]], i8* [[TMP148]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = sub i8 0, [[TMP150]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i32> [[TMP55]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP151]], i8* [[TMP153]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE45]] ; DISABLED_MASKED_STRIDED: pred.store.continue45: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP130]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP154]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if46: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP133]], i8* [[TMP132]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP157]], i8* [[TMP156]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = sub i8 0, [[TMP158]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i32> [[TMP55]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP159]], i8* [[TMP161]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE47]] ; DISABLED_MASKED_STRIDED: pred.store.continue47: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP134]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP162]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49]] ; DISABLED_MASKED_STRIDED: pred.store.if48: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: 
[[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP137]], i8* [[TMP136]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE49]] -; DISABLED_MASKED_STRIDED: pred.store.continue49: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = sub <8 x i8> zeroinitializer, [[TMP105]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP139]], label [[PRED_STORE_IF50:%.*]], label [[PRED_STORE_CONTINUE51:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if50: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = extractelement <8 x i32> [[TMP55]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP140]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i8> [[TMP138]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP142]], i8* [[TMP141]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE51]] -; DISABLED_MASKED_STRIDED: pred.store.continue51: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP143]], label [[PRED_STORE_IF52:%.*]], label [[PRED_STORE_CONTINUE53:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if52: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = extractelement <8 x i32> [[TMP55]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = extractelement <8 x i8> [[TMP138]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP146]], i8* [[TMP145]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE53]] -; DISABLED_MASKED_STRIDED: pred.store.continue53: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP147]], label [[PRED_STORE_IF54:%.*]], label [[PRED_STORE_CONTINUE55:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if54: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = extractelement <8 x i32> [[TMP55]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP148]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i8> [[TMP138]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP150]], i8* [[TMP149]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE55]] -; DISABLED_MASKED_STRIDED: pred.store.continue55: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP151]], label [[PRED_STORE_IF56:%.*]], label [[PRED_STORE_CONTINUE57:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if56: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i32> [[TMP55]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = extractelement <8 x i8> [[TMP138]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP154]], i8* [[TMP153]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE57]] -; DISABLED_MASKED_STRIDED: pred.store.continue57: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP155]], label 
[[PRED_STORE_IF58:%.*]], label [[PRED_STORE_CONTINUE59:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if58: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = extractelement <8 x i32> [[TMP55]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP156]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i8> [[TMP138]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP158]], i8* [[TMP157]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE59]] -; DISABLED_MASKED_STRIDED: pred.store.continue59: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP159]], label [[PRED_STORE_IF60:%.*]], label [[PRED_STORE_CONTINUE61:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if60: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i32> [[TMP55]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = extractelement <8 x i8> [[TMP138]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP162]], i8* [[TMP161]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE61]] -; DISABLED_MASKED_STRIDED: pred.store.continue61: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP163]], label [[PRED_STORE_IF62:%.*]], label [[PRED_STORE_CONTINUE63:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if62: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = extractelement <8 x i32> [[TMP55]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP164]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = extractelement <8 x i8> [[TMP138]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP166]], i8* [[TMP165]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE63]] -; DISABLED_MASKED_STRIDED: pred.store.continue63: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP167:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP167]], label [[PRED_STORE_IF64:%.*]], label [[PRED_STORE_CONTINUE65]] -; DISABLED_MASKED_STRIDED: pred.store.if64: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP165]], i8* [[TMP164]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP167:%.*]] = sub i8 0, [[TMP166]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP168:%.*]] = extractelement <8 x i32> [[TMP55]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP168]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP170:%.*]] = extractelement <8 x i8> [[TMP138]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP170]], i8* [[TMP169]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE65]] -; DISABLED_MASKED_STRIDED: pred.store.continue65: +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP167]], i8* [[TMP169]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE49]] +; DISABLED_MASKED_STRIDED: pred.store.continue49: ; DISABLED_MASKED_STRIDED-NEXT: 
[[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP171:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP171]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP170:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP170]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.body: ; DISABLED_MASKED_STRIDED-NEXT: [[IX_024:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 1024, [[ENTRY:%.*]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_024]], [[CONV]] @@ -1846,12 +1780,12 @@ ; DISABLED_MASKED_STRIDED: if.then: ; DISABLED_MASKED_STRIDED-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_024]], 1 ; DISABLED_MASKED_STRIDED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[MUL]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP172:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP171:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[ADD:%.*]] = or i32 [[MUL]], 1 ; DISABLED_MASKED_STRIDED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[ADD]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP173:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: [[CMP_I:%.*]] = icmp slt i8 [[TMP172]], [[TMP173]] -; DISABLED_MASKED_STRIDED-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[CMP_I]], i8 [[TMP173]], i8 [[TMP172]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP172:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[CMP_I:%.*]] = icmp slt i8 [[TMP171]], [[TMP172]] +; DISABLED_MASKED_STRIDED-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[CMP_I]], i8 [[TMP172]], i8 [[TMP171]] ; DISABLED_MASKED_STRIDED-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[MUL]] ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[SPEC_SELECT_I]], i8* [[ARRAYIDX6]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[SUB:%.*]] = sub i8 0, [[SPEC_SELECT_I]] @@ -1881,8 +1815,8 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE65:%.*]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE65]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE49:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE49]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], ; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 @@ -2055,148 +1989,115 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP107]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i8> [[TMP105]], i64 0 ; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP109]], i8* [[TMP108]], align 1 +; 
ENABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i8> [[TMP105]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = sub i8 0, [[TMP110]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i32> [[TMP55]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP112]] +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP111]], i8* [[TMP113]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] ; ENABLED_MASKED_STRIDED: pred.store.continue: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP110]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37:%.*]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP114]], label [[PRED_STORE_IF36:%.*]], label [[PRED_STORE_CONTINUE37:%.*]] ; ENABLED_MASKED_STRIDED: pred.store.if36: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP111]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP113]], i8* [[TMP112]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP117]], i8* [[TMP116]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i8> [[TMP105]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = sub i8 0, [[TMP118]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i32> [[TMP55]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP120]] +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP119]], i8* [[TMP121]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE37]] ; ENABLED_MASKED_STRIDED: pred.store.continue37: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP114]], label [[PRED_STORE_IF38:%.*]], label [[PRED_STORE_CONTINUE39:%.*]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP122]], label [[PRED_STORE_IF38:%.*]], label [[PRED_STORE_CONTINUE39:%.*]] ; ENABLED_MASKED_STRIDED: pred.store.if38: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP117]], i8* [[TMP116]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP125]], i8* [[TMP124]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i8> [[TMP105]], i64 2 +; 
ENABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = sub i8 0, [[TMP126]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i32> [[TMP55]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP128]] +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP127]], i8* [[TMP129]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE39]] ; ENABLED_MASKED_STRIDED: pred.store.continue39: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP118]], label [[PRED_STORE_IF40:%.*]], label [[PRED_STORE_CONTINUE41:%.*]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP130]], label [[PRED_STORE_IF40:%.*]], label [[PRED_STORE_CONTINUE41:%.*]] ; ENABLED_MASKED_STRIDED: pred.store.if40: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP119]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP121]], i8* [[TMP120]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP133]], i8* [[TMP132]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i8> [[TMP105]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = sub i8 0, [[TMP134]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = extractelement <8 x i32> [[TMP55]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP136]] +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP135]], i8* [[TMP137]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE41]] ; ENABLED_MASKED_STRIDED: pred.store.continue41: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP122]], label [[PRED_STORE_IF42:%.*]], label [[PRED_STORE_CONTINUE43:%.*]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP138]], label [[PRED_STORE_IF42:%.*]], label [[PRED_STORE_CONTINUE43:%.*]] ; ENABLED_MASKED_STRIDED: pred.store.if42: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP125]], i8* [[TMP124]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP141]], i8* [[TMP140]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i8> [[TMP105]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = sub i8 0, [[TMP142]] +; ENABLED_MASKED_STRIDED-NEXT: 
[[TMP144:%.*]] = extractelement <8 x i32> [[TMP55]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]] +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP143]], i8* [[TMP145]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE43]] ; ENABLED_MASKED_STRIDED: pred.store.continue43: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP126]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP146]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]] ; ENABLED_MASKED_STRIDED: pred.store.if44: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP127]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP129]], i8* [[TMP128]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP149]], i8* [[TMP148]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i8> [[TMP105]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = sub i8 0, [[TMP150]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i32> [[TMP55]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]] +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP151]], i8* [[TMP153]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE45]] ; ENABLED_MASKED_STRIDED: pred.store.continue45: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP130]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP154]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]] ; ENABLED_MASKED_STRIDED: pred.store.if46: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP133]], i8* [[TMP132]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP157]], i8* [[TMP156]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i8> [[TMP105]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = sub i8 0, [[TMP158]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i32> [[TMP55]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = 
getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]] +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP159]], i8* [[TMP161]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE47]] ; ENABLED_MASKED_STRIDED: pred.store.continue47: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP134]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49:%.*]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP162]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49]] ; ENABLED_MASKED_STRIDED: pred.store.if48: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP137]], i8* [[TMP136]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE49]] -; ENABLED_MASKED_STRIDED: pred.store.continue49: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = sub <8 x i8> zeroinitializer, [[TMP105]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP139]], label [[PRED_STORE_IF50:%.*]], label [[PRED_STORE_CONTINUE51:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if50: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = extractelement <8 x i32> [[TMP55]], i64 0 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP140]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i8> [[TMP138]], i64 0 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP142]], i8* [[TMP141]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE51]] -; ENABLED_MASKED_STRIDED: pred.store.continue51: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i1> [[TMP5]], i64 1 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP143]], label [[PRED_STORE_IF52:%.*]], label [[PRED_STORE_CONTINUE53:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if52: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = extractelement <8 x i32> [[TMP55]], i64 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = extractelement <8 x i8> [[TMP138]], i64 1 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP146]], i8* [[TMP145]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE53]] -; ENABLED_MASKED_STRIDED: pred.store.continue53: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i1> [[TMP5]], i64 2 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP147]], label [[PRED_STORE_IF54:%.*]], label [[PRED_STORE_CONTINUE55:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if54: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = extractelement <8 x i32> [[TMP55]], i64 2 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP148]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i8> [[TMP138]], i64 2 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP150]], i8* [[TMP149]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE55]] -; ENABLED_MASKED_STRIDED: pred.store.continue55: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i1> [[TMP5]], i64 3 -; 
ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP151]], label [[PRED_STORE_IF56:%.*]], label [[PRED_STORE_CONTINUE57:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if56: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i32> [[TMP55]], i64 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = extractelement <8 x i8> [[TMP138]], i64 3 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP154]], i8* [[TMP153]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE57]] -; ENABLED_MASKED_STRIDED: pred.store.continue57: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i1> [[TMP5]], i64 4 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP155]], label [[PRED_STORE_IF58:%.*]], label [[PRED_STORE_CONTINUE59:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if58: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = extractelement <8 x i32> [[TMP55]], i64 4 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP156]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i8> [[TMP138]], i64 4 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP158]], i8* [[TMP157]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE59]] -; ENABLED_MASKED_STRIDED: pred.store.continue59: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i1> [[TMP5]], i64 5 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP159]], label [[PRED_STORE_IF60:%.*]], label [[PRED_STORE_CONTINUE61:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if60: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i32> [[TMP55]], i64 5 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = extractelement <8 x i8> [[TMP138]], i64 5 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP162]], i8* [[TMP161]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE61]] -; ENABLED_MASKED_STRIDED: pred.store.continue61: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i1> [[TMP5]], i64 6 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP163]], label [[PRED_STORE_IF62:%.*]], label [[PRED_STORE_CONTINUE63:%.*]] -; ENABLED_MASKED_STRIDED: pred.store.if62: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = extractelement <8 x i32> [[TMP55]], i64 6 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP164]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = extractelement <8 x i8> [[TMP138]], i64 6 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP166]], i8* [[TMP165]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE63]] -; ENABLED_MASKED_STRIDED: pred.store.continue63: -; ENABLED_MASKED_STRIDED-NEXT: [[TMP167:%.*]] = extractelement <8 x i1> [[TMP5]], i64 7 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP167]], label [[PRED_STORE_IF64:%.*]], label [[PRED_STORE_CONTINUE65]] -; ENABLED_MASKED_STRIDED: pred.store.if64: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP165]], i8* [[TMP164]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = extractelement <8 x i8> [[TMP105]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: 
[[TMP167:%.*]] = sub i8 0, [[TMP166]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP168:%.*]] = extractelement <8 x i32> [[TMP55]], i64 7 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP168]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP170:%.*]] = extractelement <8 x i8> [[TMP138]], i64 7 -; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP170]], i8* [[TMP169]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE65]] -; ENABLED_MASKED_STRIDED: pred.store.continue65: +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP167]], i8* [[TMP169]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE49]] +; ENABLED_MASKED_STRIDED: pred.store.continue49: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; ENABLED_MASKED_STRIDED-NEXT: [[TMP171:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP171]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP170:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP170]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.body: ; ENABLED_MASKED_STRIDED-NEXT: [[IX_024:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 1024, [[ENTRY:%.*]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_024]], [[CONV]] @@ -2204,12 +2105,12 @@ ; ENABLED_MASKED_STRIDED: if.then: ; ENABLED_MASKED_STRIDED-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_024]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[MUL]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP172:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP171:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: [[ADD:%.*]] = or i32 [[MUL]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[ADD]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP173:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1 -; ENABLED_MASKED_STRIDED-NEXT: [[CMP_I:%.*]] = icmp slt i8 [[TMP172]], [[TMP173]] -; ENABLED_MASKED_STRIDED-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[CMP_I]], i8 [[TMP173]], i8 [[TMP172]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP172:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[CMP_I:%.*]] = icmp slt i8 [[TMP171]], [[TMP172]] +; ENABLED_MASKED_STRIDED-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[CMP_I]], i8 [[TMP172]], i8 [[TMP171]] ; ENABLED_MASKED_STRIDED-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[MUL]] ; ENABLED_MASKED_STRIDED-NEXT: store i8 [[SPEC_SELECT_I]], i8* [[ARRAYIDX6]], align 1 ; ENABLED_MASKED_STRIDED-NEXT: [[SUB:%.*]] = sub i8 0, [[SPEC_SELECT_I]] @@ -2298,8 +2199,8 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; DISABLED_MASKED_STRIDED: vector.body: -; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE62:%.*]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE62]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[PRED_STORE_CONTINUE46:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE46]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], @@ -2474,148 +2375,115 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[TMP105:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP104]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = extractelement <8 x i8> [[TMP102]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP106]], i8* [[TMP105]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = extractelement <8 x i8> [[TMP102]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = sub i8 0, [[TMP107]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i32> [[TMP52]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP109]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP108]], i8* [[TMP110]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] ; DISABLED_MASKED_STRIDED: pred.store.continue: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP107]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP111]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if33: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = extractelement <8 x i32> [[TMP2]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP108]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i8> [[TMP102]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP110]], i8* [[TMP109]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i32> [[TMP2]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP112]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i8> [[TMP102]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP114]], i8* [[TMP113]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i8> [[TMP102]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = sub i8 0, [[TMP115]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i32> [[TMP52]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP117]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP116]], i8* [[TMP118]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE34]] ; DISABLED_MASKED_STRIDED: pred.store.continue34: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP111]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP119]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if35: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = 
extractelement <8 x i32> [[TMP2]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP112]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i8> [[TMP102]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP114]], i8* [[TMP113]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i32> [[TMP2]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP120]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i8> [[TMP102]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP122]], i8* [[TMP121]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i8> [[TMP102]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = sub i8 0, [[TMP123]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i32> [[TMP52]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP125]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP124]], i8* [[TMP126]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE36]] ; DISABLED_MASKED_STRIDED: pred.store.continue36: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP115]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP127]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if37: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = extractelement <8 x i32> [[TMP2]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP116]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i8> [[TMP102]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP118]], i8* [[TMP117]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i32> [[TMP2]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP128]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i8> [[TMP102]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP130]], i8* [[TMP129]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i8> [[TMP102]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = sub i8 0, [[TMP131]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = extractelement <8 x i32> [[TMP52]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP133]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP132]], i8* [[TMP134]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE38]] ; DISABLED_MASKED_STRIDED: pred.store.continue38: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP119]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP135]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if39: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i32> [[TMP2]], i64 4 -; 
DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP120]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i8> [[TMP102]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP122]], i8* [[TMP121]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = extractelement <8 x i32> [[TMP2]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP136]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = extractelement <8 x i8> [[TMP102]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP138]], i8* [[TMP137]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i8> [[TMP102]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = sub i8 0, [[TMP139]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i32> [[TMP52]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP141]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP140]], i8* [[TMP142]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE40]] ; DISABLED_MASKED_STRIDED: pred.store.continue40: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP123]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP143]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if41: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = extractelement <8 x i32> [[TMP2]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP124]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i8> [[TMP102]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP126]], i8* [[TMP125]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = extractelement <8 x i32> [[TMP2]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP144]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = extractelement <8 x i8> [[TMP102]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP146]], i8* [[TMP145]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i8> [[TMP102]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = sub i8 0, [[TMP147]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i32> [[TMP52]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP149]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP148]], i8* [[TMP150]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE42]] ; DISABLED_MASKED_STRIDED: pred.store.continue42: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP127]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP151]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if43: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i32> [[TMP2]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = getelementptr inbounds 
i8, i8* [[Q]], i32 [[TMP128]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i8> [[TMP102]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP130]], i8* [[TMP129]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i32> [[TMP2]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP152]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = extractelement <8 x i8> [[TMP102]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP154]], i8* [[TMP153]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i8> [[TMP102]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = sub i8 0, [[TMP155]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i32> [[TMP52]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP157]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP156]], i8* [[TMP158]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE44]] ; DISABLED_MASKED_STRIDED: pred.store.continue44: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP131]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP159]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46]] ; DISABLED_MASKED_STRIDED: pred.store.if45: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = extractelement <8 x i32> [[TMP2]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP132]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i8> [[TMP102]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP134]], i8* [[TMP133]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE46]] -; DISABLED_MASKED_STRIDED: pred.store.continue46: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = sub <8 x i8> zeroinitializer, [[TMP102]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = extractelement <8 x i1> [[TMP3]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP136]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if47: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i32> [[TMP52]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP137]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i8> [[TMP135]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP139]], i8* [[TMP138]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE48]] -; DISABLED_MASKED_STRIDED: pred.store.continue48: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = extractelement <8 x i1> [[TMP3]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP140]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if49: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i32> [[TMP52]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP141]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i8> [[TMP135]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP143]], i8* [[TMP142]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label 
[[PRED_STORE_CONTINUE50]] -; DISABLED_MASKED_STRIDED: pred.store.continue50: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = extractelement <8 x i1> [[TMP3]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP144]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if51: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = extractelement <8 x i32> [[TMP52]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP145]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i8> [[TMP135]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP147]], i8* [[TMP146]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE52]] -; DISABLED_MASKED_STRIDED: pred.store.continue52: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = extractelement <8 x i1> [[TMP3]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP148]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if53: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i32> [[TMP52]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP149]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i8> [[TMP135]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP151]], i8* [[TMP150]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE54]] -; DISABLED_MASKED_STRIDED: pred.store.continue54: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i1> [[TMP3]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP152]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if55: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = extractelement <8 x i32> [[TMP52]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP153]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i8> [[TMP135]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP155]], i8* [[TMP154]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE56]] -; DISABLED_MASKED_STRIDED: pred.store.continue56: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = extractelement <8 x i1> [[TMP3]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP156]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if57: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i32> [[TMP52]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP157]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i8> [[TMP135]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP159]], i8* [[TMP158]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE58]] -; DISABLED_MASKED_STRIDED: pred.store.continue58: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i1> [[TMP3]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP160]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if59: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = extractelement <8 x i32> [[TMP52]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP161]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i8> 
[[TMP135]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP163]], i8* [[TMP162]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] -; DISABLED_MASKED_STRIDED: pred.store.continue60: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = extractelement <8 x i1> [[TMP3]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP164]], label [[PRED_STORE_IF61:%.*]], label [[PRED_STORE_CONTINUE62]] -; DISABLED_MASKED_STRIDED: pred.store.if61: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i32> [[TMP2]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP160]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = extractelement <8 x i8> [[TMP102]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP162]], i8* [[TMP161]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i8> [[TMP102]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = sub i8 0, [[TMP163]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = extractelement <8 x i32> [[TMP52]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP165]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP167:%.*]] = extractelement <8 x i8> [[TMP135]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP167]], i8* [[TMP166]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE62]] -; DISABLED_MASKED_STRIDED: pred.store.continue62: +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP164]], i8* [[TMP166]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE46]] +; DISABLED_MASKED_STRIDED: pred.store.continue46: ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP168:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP168]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP167:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP167]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void ; @@ -2740,8 +2608,8 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer ; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; DISABLED_MASKED_STRIDED: vector.body: -; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE60:%.*]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE60]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE44:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE44]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0 @@ -2914,148 +2782,115 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[TMP103:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP102]] ; 
DISABLED_MASKED_STRIDED-NEXT: [[TMP104:%.*]] = extractelement <8 x i8> [[TMP100]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP104]], i8* [[TMP103]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP105:%.*]] = extractelement <8 x i8> [[TMP100]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = sub i8 0, [[TMP105]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = extractelement <8 x i32> [[TMP50]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP107]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP106]], i8* [[TMP108]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] ; DISABLED_MASKED_STRIDED: pred.store.continue: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP105:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP105]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP109]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if31: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = extractelement <8 x i32> [[TMP1]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP106]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP108]], i8* [[TMP107]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i32> [[TMP1]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP112]], i8* [[TMP111]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = extractelement <8 x i8> [[TMP100]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = sub i8 0, [[TMP113]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i32> [[TMP50]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP115]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP114]], i8* [[TMP116]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE32]] ; DISABLED_MASKED_STRIDED: pred.store.continue32: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP109]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP117]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if33: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i32> [[TMP1]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP112]], i8* [[TMP111]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i32> [[TMP1]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i8> 
[[TMP100]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP120]], i8* [[TMP119]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = extractelement <8 x i8> [[TMP100]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = sub i8 0, [[TMP121]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i32> [[TMP50]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP123]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP122]], i8* [[TMP124]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE34]] ; DISABLED_MASKED_STRIDED: pred.store.continue34: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP113]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP125]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if35: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i32> [[TMP1]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP114]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP116]], i8* [[TMP115]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i32> [[TMP1]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP128]], i8* [[TMP127]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = extractelement <8 x i8> [[TMP100]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = sub i8 0, [[TMP129]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i32> [[TMP50]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP131]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP130]], i8* [[TMP132]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE36]] ; DISABLED_MASKED_STRIDED: pred.store.continue36: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP117]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP133]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if37: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i32> [[TMP1]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP120]], i8* [[TMP119]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i32> [[TMP1]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP134]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP136]], 
i8* [[TMP135]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i8> [[TMP100]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = sub i8 0, [[TMP137]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i64 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP138]], i8* [[TMP140]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE38]] ; DISABLED_MASKED_STRIDED: pred.store.continue38: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP121]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP141]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if39: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i32> [[TMP1]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP122]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP124]], i8* [[TMP123]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i32> [[TMP1]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP142]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP144]], i8* [[TMP143]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = extractelement <8 x i8> [[TMP100]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = sub i8 0, [[TMP145]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i64 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP146]], i8* [[TMP148]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE40]] ; DISABLED_MASKED_STRIDED: pred.store.continue40: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP125]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP149]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] ; DISABLED_MASKED_STRIDED: pred.store.if41: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i32> [[TMP1]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP128]], i8* [[TMP127]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i32> [[TMP1]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP150]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i8> [[TMP100]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP152]], i8* [[TMP151]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] 
= extractelement <8 x i8> [[TMP100]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = sub i8 0, [[TMP153]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i64 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]] +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP154]], i8* [[TMP156]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE42]] ; DISABLED_MASKED_STRIDED: pred.store.continue42: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP129]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP157]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44]] ; DISABLED_MASKED_STRIDED: pred.store.if43: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP130]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP132]], i8* [[TMP131]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE44]] -; DISABLED_MASKED_STRIDED: pred.store.continue44: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = sub <8 x i8> zeroinitializer, [[TMP100]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i1> [[TMP0]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP134]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if45: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = extractelement <8 x i32> [[TMP50]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i8> [[TMP133]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP137]], i8* [[TMP136]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE46]] -; DISABLED_MASKED_STRIDED: pred.store.continue46: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = extractelement <8 x i1> [[TMP0]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP138]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if47: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i8> [[TMP133]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP141]], i8* [[TMP140]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE48]] -; DISABLED_MASKED_STRIDED: pred.store.continue48: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i1> [[TMP0]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP142]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if49: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i32> [[TMP50]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP143]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = extractelement <8 x i8> [[TMP133]], i64 2 -; 
DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP145]], i8* [[TMP144]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE50]] -; DISABLED_MASKED_STRIDED: pred.store.continue50: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = extractelement <8 x i1> [[TMP0]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP146]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if51: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i8> [[TMP133]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP149]], i8* [[TMP148]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE52]] -; DISABLED_MASKED_STRIDED: pred.store.continue52: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i1> [[TMP0]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP150]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if53: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i32> [[TMP50]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP151]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = extractelement <8 x i8> [[TMP133]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP153]], i8* [[TMP152]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE54]] -; DISABLED_MASKED_STRIDED: pred.store.continue54: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = extractelement <8 x i1> [[TMP0]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP154]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if55: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i8> [[TMP133]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP157]], i8* [[TMP156]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE56]] -; DISABLED_MASKED_STRIDED: pred.store.continue56: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i1> [[TMP0]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP158]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]] -; DISABLED_MASKED_STRIDED: pred.store.if57: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i32> [[TMP50]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP159]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = extractelement <8 x i8> [[TMP133]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP161]], i8* [[TMP160]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE58]] -; DISABLED_MASKED_STRIDED: pred.store.continue58: -; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = extractelement <8 x i1> [[TMP0]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP162]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60]] -; DISABLED_MASKED_STRIDED: pred.store.if59: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = getelementptr inbounds 
i8, i8* [[Q]], i32 [[TMP158]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP160]], i8* [[TMP159]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = extractelement <8 x i8> [[TMP100]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = sub i8 0, [[TMP161]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i32> [[TMP50]], i64 7 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = extractelement <8 x i8> [[TMP133]], i64 7 -; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP165]], i8* [[TMP164]], align 1 -; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] -; DISABLED_MASKED_STRIDED: pred.store.continue60: +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP162]], i8* [[TMP164]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE44]] +; DISABLED_MASKED_STRIDED: pred.store.continue44: ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> -; DISABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP166]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP165]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll deleted file mode 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ /dev/null @@ -1,384 +0,0 @@ -; This test verifies that the loop vectorizer will NOT produce a tail -; loop with the optimize for size or the minimize size attributes. -; REQUIRES: asserts -; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -S | FileCheck %s -; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -pgso -S | FileCheck %s -check-prefix=PGSO -; RUN: opt < %s -enable-new-pm=0 -loop-vectorize -pgso=false -S | FileCheck %s -check-prefix=NPGSO -; RUN: opt < %s -passes='require<profile-summary>,loop-vectorize' -S | FileCheck %s -; RUN: opt < %s -passes='require<profile-summary>,loop-vectorize' -pgso -S | FileCheck %s -check-prefix=PGSO -; RUN: opt < %s -passes='require<profile-summary>,loop-vectorize' -pgso=false -S | FileCheck %s -check-prefix=NPGSO - -target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" - -@tab = common global [32 x i8] zeroinitializer, align 1 - -define i32 @foo_optsize() #0 { -; CHECK-LABEL: @foo_optsize( -; CHECK-NOT: <2 x i8> -; CHECK-NOT: <4 x i8> - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. 
= select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp eq i32 %i.08, 202 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret i32 0 -} - -attributes #0 = { optsize } - -define i32 @foo_minsize() #1 { -; CHECK-LABEL: @foo_minsize( -; CHECK-NOT: <2 x i8> -; CHECK-NOT: <4 x i8> -; CHECK-LABEL: @foo_pgso( - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp eq i32 %i.08, 202 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret i32 0 -} - -attributes #1 = { minsize } - -define i32 @foo_pgso() !prof !14 { -; PGSO-LABEL: @foo_pgso( -; PGSO-NOT: <{{[0-9]+}} x i8> -; NPGSO-LABEL: @foo_pgso( -; NPGSO: <{{[0-9]+}} x i8> - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp eq i32 %i.08, 202 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret i32 0 -} - -; PR43371: don't run into an assert due to emitting SCEV runtime checks -; with OptForSize. -; -@cm_array = external global [2592 x i16], align 1 - -define void @pr43371() optsize { -; -; CHECK-LABEL: @pr43371 -; CHECK-NOT: vector.scevcheck -; -; We do not want to generate SCEV predicates when optimising for size, because -; that will lead to extra code generation such as the SCEV overflow runtime -; checks. Not generating SCEV predicates can still result in vectorisation as -; the non-consecutive loads/stores can be scalarized: -; -; CHECK: vector.body: -; CHECK: store i16 0, i16* %{{.*}}, align 1 -; CHECK: store i16 0, i16* %{{.*}}, align 1 -; CHECK: br i1 {{.*}}, label %vector.body -; -entry: - br label %for.body29 - -for.cond.cleanup28: - unreachable - -for.body29: - %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 - %idxprom34 = zext i16 %add33 to i32 - %arrayidx35 = getelementptr [2592 x i16], [2592 x i16] * @cm_array, i32 0, i32 %idxprom34 - store i16 0, i16 * %arrayidx35, align 1 - %inc37 = add i16 %i24.0170, 1 - %cmp26 = icmp ult i16 %inc37, 756 - br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 -} - -define void @pr43371_pgso() !prof !14 { -; -; CHECK-LABEL: @pr43371_pgso -; CHECK-NOT: vector.scevcheck -; -; We do not want to generate SCEV predicates when optimising for size, because -; that will lead to extra code generation such as the SCEV overflow runtime -; checks. 
Not generating SCEV predicates can still result in vectorisation as -; the non-consecutive loads/stores can be scalarized: -; -; CHECK: vector.body: -; CHECK: store i16 0, i16* %{{.*}}, align 1 -; CHECK: store i16 0, i16* %{{.*}}, align 1 -; CHECK: br i1 {{.*}}, label %vector.body -; -entry: - br label %for.body29 - -for.cond.cleanup28: - unreachable - -for.body29: - %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 - %idxprom34 = zext i16 %add33 to i32 - %arrayidx35 = getelementptr [2592 x i16], [2592 x i16] * @cm_array, i32 0, i32 %idxprom34 - store i16 0, i16 * %arrayidx35, align 1 - %inc37 = add i16 %i24.0170, 1 - %cmp26 = icmp ult i16 %inc37, 756 - br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 -} - -; PR45526: don't vectorize with fold-tail if first-order-recurrence is live-out. -; -define i32 @pr45526() optsize { -; -; CHECK-LABEL: @pr45526 -; CHECK-NEXT: entry: -; CHECK-NEXT: br label %loop -; CHECK-EMPTY: -; CHECK-NEXT: loop: -; CHECK-NEXT: %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] -; CHECK-NEXT: %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] -; CHECK-NEXT: %pivPlus1 = add nuw nsw i32 %piv, 1 -; CHECK-NEXT: %cond = icmp ult i32 %piv, 510 -; CHECK-NEXT: br i1 %cond, label %loop, label %exit -; CHECK-EMPTY: -; CHECK-NEXT: exit: -; CHECK-NEXT: %for.lcssa = phi i32 [ %for, %loop ] -; CHECK-NEXT: ret i32 %for.lcssa -; -entry: - br label %loop - -loop: - %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] - %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] - %pivPlus1 = add nuw nsw i32 %piv, 1 - %cond = icmp ult i32 %piv, 510 - br i1 %cond, label %loop, label %exit - -exit: - ret i32 %for -} - -define i32 @pr45526_pgso() !prof !14 { -; -; CHECK-LABEL: @pr45526_pgso -; CHECK-NEXT: entry: -; CHECK-NEXT: br label %loop -; CHECK-EMPTY: -; CHECK-NEXT: loop: -; CHECK-NEXT: %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] -; CHECK-NEXT: %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] -; CHECK-NEXT: %pivPlus1 = add nuw nsw i32 %piv, 1 -; CHECK-NEXT: %cond = icmp ult i32 %piv, 510 -; CHECK-NEXT: br i1 %cond, label %loop, label %exit -; CHECK-EMPTY: -; CHECK-NEXT: exit: -; CHECK-NEXT: %for.lcssa = phi i32 [ %for, %loop ] -; CHECK-NEXT: ret i32 %for.lcssa -; -entry: - br label %loop - -loop: - %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] - %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] - %pivPlus1 = add nuw nsw i32 %piv, 1 - %cond = icmp ult i32 %piv, 510 - br i1 %cond, label %loop, label %exit - -exit: - ret i32 %for -} - -; PR46228: Vectorize w/o versioning for unit stride under optsize and enabled -; vectorization. 
- -; NOTE: Some assertions have been autogenerated by utils/update_test_checks.py -define void @stride1(i16* noalias %B, i32 %BStride) optsize { -; CHECK-LABEL: @stride1( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[BSTRIDE:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ] -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 1024, i32 1024> -; CHECK-NEXT: [[TMP0:%.*]] = mul nsw <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]] -; CHECK-NEXT: store i16 42, i16* [[TMP4]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.if1: -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP6]] -; CHECK-NEXT: store i16 42, i16* [[TMP7]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2> -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !21 -; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK: for.end: -; CHECK-NEXT: ret void -; -; PGSO-LABEL: @stride1( -; PGSO-NEXT: entry: -; PGSO-NEXT: br i1 false, label %scalar.ph, label %vector.ph -; -; NPGSO-LABEL: @stride1( -; NPGSO-NEXT: entry: -; NPGSO-NEXT: br i1 false, label %scalar.ph, label %vector.ph - -entry: - br label %for.body - -for.body: - %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ] - %mulB = mul nsw i32 %iv, %BStride - %gepOfB = getelementptr inbounds i16, i16* %B, i32 %mulB - store i16 42, i16* %gepOfB, align 4 - %iv.next = add nuw nsw i32 %iv, 1 - %exitcond = icmp eq i32 %iv.next, 1025 - br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !15 - -for.end: - ret void -} - -; Vectorize with versioning for unit stride for PGSO and enabled vectorization. 
-; -define void @stride1_pgso(i16* noalias %B, i32 %BStride) !prof !14 { -; CHECK-LABEL: @stride1_pgso( -; CHECK: vector.body -; -; PGSO-LABEL: @stride1_pgso( -; PGSO: vector.body -; -; NPGSO-LABEL: @stride1_pgso( -; NPGSO: vector.body - -entry: - br label %for.body - -for.body: - %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ] - %mulB = mul nsw i32 %iv, %BStride - %gepOfB = getelementptr inbounds i16, i16* %B, i32 %mulB - store i16 42, i16* %gepOfB, align 4 - %iv.next = add nuw nsw i32 %iv, 1 - %exitcond = icmp eq i32 %iv.next, 1025 - br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !15 - -for.end: - ret void -} - -; PR46652: Check that the need for stride==1 check prevents vectorizing a loop -; having tiny trip count, when compiling w/o -Os/-Oz. -; CHECK-LABEL: @pr46652 -; CHECK-NOT: vector.scevcheck -; CHECK-NOT: vector.body -; CHECK-LABEL: for.body - -@g = external global [1 x i16], align 1 - -define void @pr46652(i16 %stride) { -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %l1.02 = phi i16 [ 1, %entry ], [ %inc9, %for.body ] - %mul = mul nsw i16 %l1.02, %stride - %arrayidx6 = getelementptr inbounds [1 x i16], [1 x i16]* @g, i16 0, i16 %mul - %0 = load i16, i16* %arrayidx6, align 1 - %inc9 = add nuw nsw i16 %l1.02, 1 - %exitcond.not = icmp eq i16 %inc9, 16 - br i1 %exitcond.not, label %for.end, label %for.body - -for.end: ; preds = %for.body - ret void -} - -; Make sure we do not crash while building the VPlan for the loop with the -; select below. -define i32 @PR48142(i32* %ptr.start, i32* %ptr.end) optsize { -; CHECK-LABEL: PR48142 -; CHECK-NOT: vector.body -entry: - br label %for.body - -for.body: - %i.014 = phi i32 [ 20, %entry ], [ %cond, %for.body ] - %ptr.iv = phi i32* [ %ptr.start, %entry ], [ %ptr.next, %for.body ] - %cmp4 = icmp slt i32 %i.014, 99 - %cond = select i1 %cmp4, i32 99, i32 %i.014 - store i32 0, i32* %ptr.iv - %ptr.next = getelementptr inbounds i32, i32* %ptr.iv, i64 1 - %cmp.not = icmp eq i32* %ptr.next, %ptr.end - br i1 %cmp.not, label %exit, label %for.body - -exit: - %res = phi i32 [ %cond, %for.body ] - ret i32 %res -} - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"ProfileSummary", !1} -!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} -!2 = !{!"ProfileFormat", !"InstrProf"} -!3 = !{!"TotalCount", i64 10000} -!4 = !{!"MaxCount", i64 10} -!5 = !{!"MaxInternalCount", i64 1} -!6 = !{!"MaxFunctionCount", i64 1000} -!7 = !{!"NumCounts", i64 3} -!8 = !{!"NumFunctions", i64 3} -!9 = !{!"DetailedSummary", !10} -!10 = !{!11, !12, !13} -!11 = !{i32 10000, i64 100, i32 1} -!12 = !{i32 999000, i64 100, i32 1} -!13 = !{i32 999999, i64 1, i32 2} -!14 = !{!"function_entry_count", i64 0} -!15 = distinct !{!15, !16} -!16 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll deleted file mode 100644 --- a/llvm/test/Transforms/LoopVectorize/tripcount.ll +++ /dev/null @@ -1,222 +0,0 @@ -; This test verifies that the loop vectorizer will not vectorizes low trip count -; loops that require runtime checks (Trip count is computed with profile info). -; REQUIRES: asserts -; RUN: opt < %s -loop-vectorize -loop-vectorize-with-block-frequency -S | FileCheck %s - -target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" - -@tab = common global [32 x i8] zeroinitializer, align 1 - -define i32 @foo_low_trip_count1(i32 %bound) { -; Simple loop with low tripcount. Should not be vectorized. 
- -; CHECK-LABEL: @foo_low_trip_count1( -; CHECK-NOT: <{{[0-9]+}} x i8> - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp eq i32 %i.08, %bound - br i1 %exitcond, label %for.end, label %for.body, !prof !1 - -for.end: ; preds = %for.body - ret i32 0 -} - -define i32 @foo_low_trip_count2(i32 %bound) !prof !0 { -; The loop has a same invocation count with the function, but has a low -; trip_count per invocation and not worth to vectorize. - -; CHECK-LABEL: @foo_low_trip_count2( -; CHECK-NOT: <{{[0-9]+}} x i8> - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp eq i32 %i.08, %bound - br i1 %exitcond, label %for.end, label %for.body, !prof !1 - -for.end: ; preds = %for.body - ret i32 0 -} - -define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 { -; The loop has low invocation count compare to the function invocation count, -; but has a high trip count per invocation. Vectorize it. - -; CHECK-LABEL: @foo_low_trip_count3( -; CHECK: [[VECTOR_BODY:vector\.body]]: -; CHECK: br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]], -; CHECK: [[FOR_BODY:for\.body]]: -; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]], -entry: - br i1 %cond, label %for.preheader, label %for.end, !prof !2 - -for.preheader: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp eq i32 %i.08, %bound - br i1 %exitcond, label %for.end, label %for.body, !prof !3 - -for.end: ; preds = %for.body - ret i32 0 -} - -define i32 @foo_low_trip_count_icmp_sgt(i32 %bound) { -; Simple loop with low tripcount and inequality test for exit. -; Should not be vectorized. - -; CHECK-LABEL: @foo_low_trip_count_icmp_sgt( -; CHECK-NOT: <{{[0-9]+}} x i8> - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp sgt i32 %i.08, %bound - br i1 %exitcond, label %for.end, label %for.body, !prof !1 - -for.end: ; preds = %for.body - ret i32 0 -} - -define i32 @const_low_trip_count() { -; Simple loop with constant, small trip count and no profiling info. 
- -; CHECK-LABEL: @const_low_trip_count -; CHECK-NOT: <{{[0-9]+}} x i8> - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp slt i32 %i.08, 2 - br i1 %exitcond, label %for.body, label %for.end - -for.end: ; preds = %for.body - ret i32 0 -} - -define i32 @const_large_trip_count() { -; Simple loop with constant large trip count and no profiling info. - -; CHECK-LABEL: @const_large_trip_count -; CHECK: <{{[0-9]+}} x i8> - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp slt i32 %i.08, 1000 - br i1 %exitcond, label %for.body, label %for.end - -for.end: ; preds = %for.body - ret i32 0 -} - -define i32 @const_small_trip_count_step() { -; Simple loop with static, small trip count and no profiling info. - -; CHECK-LABEL: @const_small_trip_count_step -; CHECK-NOT: <{{[0-9]+}} x i8> - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 5 - %exitcond = icmp slt i32 %i.08, 10 - br i1 %exitcond, label %for.body, label %for.end - -for.end: ; preds = %for.body - ret i32 0 -} - -define i32 @const_trip_over_profile() { -; constant trip count takes precedence over profile data - -; CHECK-LABEL: @const_trip_over_profile -; CHECK: <{{[0-9]+}} x i8> - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 - %0 = load i8, i8* %arrayidx, align 1 - %cmp1 = icmp eq i8 %0, 0 - %. = select i1 %cmp1, i8 2, i8 1 - store i8 %., i8* %arrayidx, align 1 - %inc = add nsw i32 %i.08, 1 - %exitcond = icmp slt i32 %i.08, 1000 - br i1 %exitcond, label %for.body, label %for.end, !prof !1 - -for.end: ; preds = %for.body - ret i32 0 -} - -; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490} -; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0} -; original loop has latchExitWeight=10 and backedgeTakenWeight=10,000, -; therefore estimatedBackedgeTakenCount=1,000 and estimatedTripCount=1,001. -; Vectorizing by 4 produces estimatedTripCounts of 1,001/4=250 and 1,001%4=1 -; for vectorized and remainder loops, respectively, therefore their -; estimatedBackedgeTakenCounts are 249 and 0, and so the weights recorded with -; loop invocation weights of 10 are the above {10, 2490} and {10, 0}. - -!0 = !{!"function_entry_count", i64 100} -!1 = !{!"branch_weights", i32 100, i32 0} -!2 = !{!"branch_weights", i32 10, i32 90} -!3 = !{!"branch_weights", i32 10, i32 10000}