diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7032,7 +7032,7 @@
 
   unsigned AS = getLoadStoreAddressSpace(I);
   Value *Ptr = getLoadStorePointerOperand(I);
-  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+  Type *PtrTy = Ptr->getType();
 
   // Figure out whether the access is strided and get the stride value
   // if it's known in compile time
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i16-with-i8-index.ll
@@ -17,45 +17,45 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 16 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 32 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE2: LV: Found an estimated cost of 64 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 16 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 32 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; SSE42: LV: Found an estimated cost of 64 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX1: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 58 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX1: LV: Found an estimated cost of 116 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 58 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 116 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 218 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 436 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 58 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 116 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 222 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 444 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
-; AVX512: LV: Found an estimated cost of 888 for VF 64 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 62 for VF 16 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 124 for VF 32 For instruction: %valB = load i16, i16* %inB, align 2
+; AVX512: LV: Found an estimated cost of 248 for VF 64 For instruction: %valB = load i16, i16* %inB, align 2
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %valB = load i16, i16* %inB, align 4
 define void @test() {
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i32-with-i8-index.ll
@@ -17,30 +17,30 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 19 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 38 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE2: LV: Found an estimated cost of 76 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
-; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 19 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 38 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
+; SSE42: LV: Found an estimated cost of 76 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX1: LV: Found an estimated cost of 440 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 30 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 60 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX1: LV: Found an estimated cost of 120 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 440 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 30 for VF 8 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 60 for VF 16 For instruction: %valB = load i32, i32* %inB, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 120 for VF 32 For instruction: %valB = load i32, i32* %inB, align 4
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i32, i32* %inB, align 4
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i32, i32* %inB, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i64-with-i8-index.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse42 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse42 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE42
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX1
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,-fast-gather --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2-SLOWGATHER
 ; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2,+fast-gather --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX2-FASTGATHER
@@ -17,30 +17,30 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 18 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 36 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE2: LV: Found an estimated cost of 72 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 58 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
-; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 18 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 36 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
+; SSE42: LV: Found an estimated cost of 72 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 4 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction: %valB = load i64, i64* %inB, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction: %valB = load i64, i64* %inB, align 8
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i64, i64* %inB, align 8
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 4 for VF 2 For instruction: %valB = load i64, i64* %inB, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/gather-i8-with-i8-index.ll
@@ -17,45 +17,45 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 19 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 39 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE2: LV: Found an estimated cost of 79 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 19 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 39 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; SSE42: LV: Found an estimated cost of 79 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX1: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 56 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX1: LV: Found an estimated cost of 114 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 114 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 434 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 114 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 442 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
-; AVX512: LV: Found an estimated cost of 884 for VF 64 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 60 for VF 16 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 122 for VF 32 For instruction: %valB = load i8, i8* %inB, align 1
+; AVX512: LV: Found an estimated cost of 244 for VF 64 For instruction: %valB = load i8, i8* %inB, align 1
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: %valB = load i8, i8* %inB, align 4
 define void @test() {
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-3-indices-0uu.ll
@@ -18,13 +18,13 @@
 ; SSE2: LV: Found an estimated cost of 34 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX1: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 25 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX1: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
 ;
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, i32* %in0, align 4
-; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
+; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 25 for VF 8 For instruction: %v0 = load i32, i32* %in0, align 4
 ; AVX2: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, i32* %in0, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
@@ -50,8 +50,8 @@
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 40 for VF 32 For instruction: store i32 %valB, i32* %out, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4
-; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
-; AVX512: LV: Found an estimated cost of 16 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
+; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
+; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
 ; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
 ; AVX512: LV: Found an estimated cost of 18 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
 ; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction: store i32 %valB, i32* %out, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i16-with-i8-index.ll
@@ -17,45 +17,45 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2
-; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
-; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
-; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
-; SSE2: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
+; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
+; SSE2: LV: Found an estimated cost of 16 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
+; SSE2: LV: Found an estimated cost of 32 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
+; SSE2: LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2
-; SSE42: LV: Found an estimated cost of 28 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
-; SSE42: LV: Found an estimated cost of 56 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
-; SSE42: LV: Found an estimated cost of 112 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
-; SSE42: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
+; SSE42: LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
+; SSE42: LV: Found an estimated cost of 16 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
+; SSE42: LV: Found an estimated cost of 32 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
+; SSE42: LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction: store i16 %valB, i16* %out, align 2
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction: store i16 %valB, i16* %out, align 2
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction: store i16 %valB, i16* %out, align 2
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 228 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 464 for VF 32 For instruction: store i16 %valB, i16* %out, align 2
-; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction: store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction: store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 68 for VF 16 For instruction: store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 144 for VF 32 For instruction: store i16 %valB, i16* %out, align 2
+; AVX512: LV: Found an estimated cost of 288 for VF 64 For instruction: store i16 %valB, i16* %out, align 2
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i16 %valB, i16* %out
 define void @test() {
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i32-with-i8-index.ll
@@ -17,37 +17,37 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
-; SSE2: LV: Found an estimated cost of 118 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
-; SSE2: LV: Found an estimated cost of 236 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
+; SSE2: LV: Found an estimated cost of 19 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
+; SSE2: LV: Found an estimated cost of 38 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
+; SSE2: LV: Found an estimated cost of 76 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
-; SSE42: LV: Found an estimated cost of 118 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
-; SSE42: LV: Found an estimated cost of 236 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
+; SSE42: LV: Found an estimated cost of 19 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
+; SSE42: LV: Found an estimated cost of 38 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
+; SSE42: LV: Found an estimated cost of 76 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction: store i32 %valB, i32* %out, align 4
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction: store i32 %valB, i32* %out, align 4
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction: store i32 %valB, i32* %out, align 4
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction: store i32 %valB, i32* %out, align 4
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, i32* %out
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %valB, i32* %out
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i64-with-i8-index.ll
@@ -17,37 +17,37 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
-; SSE2: LV: Found an estimated cost of 58 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
-; SSE2: LV: Found an estimated cost of 116 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
-; SSE2: LV: Found an estimated cost of 232 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
+; SSE2: LV: Found an estimated cost of 18 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
+; SSE2: LV: Found an estimated cost of 36 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
+; SSE2: LV: Found an estimated cost of 72 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
-; SSE42: LV: Found an estimated cost of 58 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
-; SSE42: LV: Found an estimated cost of 116 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
-; SSE42: LV: Found an estimated cost of 232 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
+; SSE42: LV: Found an estimated cost of 18 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
+; SSE42: LV: Found an estimated cost of 36 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
+; SSE42: LV: Found an estimated cost of 72 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 224 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 16 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 32 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 64 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction: store i64 %valB, i64* %out, align 8
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 16 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction: store i64 %valB, i64* %out, align 8
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 224 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 16 for VF 4 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 32 for VF 8 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 64 for VF 16 For instruction: store i64 %valB, i64* %out, align 8
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction: store i64 %valB, i64* %out, align 8
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, i64* %out, align 8
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %valB, i64* %out, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/scatter-i8-with-i8-index.ll
@@ -17,45 +17,45 @@
 ; CHECK: LV: Checking a loop in "test"
 ;
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1
-; SSE2: LV: Found an estimated cost of 29 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
-; SSE2: LV: Found an estimated cost of 59 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
-; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
-; SSE2: LV: Found an estimated cost of 239 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
+; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
+; SSE2: LV: Found an estimated cost of 19 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
+; SSE2: LV: Found an estimated cost of 39 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
+; SSE2: LV: Found an estimated cost of 79 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
 ;
 ; SSE42: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1
-; SSE42: LV: Found an estimated cost of 29 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
-; SSE42: LV: Found an estimated cost of 59 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
-; SSE42: LV: Found an estimated cost of 119 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
-; SSE42: LV: Found an estimated cost of 239 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
+; SSE42: LV: Found an estimated cost of 9 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
+; SSE42: LV: Found an estimated cost of 19 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
+; SSE42: LV: Found an estimated cost of 39 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
+; SSE42: LV: Found an estimated cost of 79 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
 ;
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 108 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 216 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
-; AVX1: LV: Found an estimated cost of 448 for VF 32 For instruction: store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 56 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
+; AVX1: LV: Found an estimated cost of 128 for VF 32 For instruction: store i8 %valB, i8* %out, align 1
 ;
 ; AVX2-SLOWGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-SLOWGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-SLOWGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction: store i8 %valB, i8* %out, align 1
 ;
 ; AVX2-FASTGATHER: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 108 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 216 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
-; AVX2-FASTGATHER: LV: Found an estimated cost of 448 for VF 32 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 6 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 14 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 28 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 56 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
+; AVX2-FASTGATHER: LV: Found an estimated cost of 128 for VF 32 For instruction: store i8 %valB, i8* %out, align 1
 ;
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 54 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 110 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 220 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 456 for VF 32 For instruction: store i8 %valB, i8* %out, align 1
-; AVX512: LV: Found an estimated cost of 928 for VF 64 For instruction: store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 30 for VF 8 For instruction: store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 60 for VF 16 For instruction: store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 136 for VF 32 For instruction: store i8 %valB, i8* %out, align 1
+; AVX512: LV: Found an estimated cost of 288 for VF 64 For instruction: store i8 %valB, i8* %out, align 1
 ;
 ; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF {{[0-9]+}} For instruction: store i8 %valB, i8* %out
 define void @test() {
a/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" @@ -10,12 +11,283 @@ @g_ = global i8 0, align 1 @b_ = global i8 0, align 1 -; We don't want to vectorize most loops containing gathers because they are -; expensive. -; Make sure we don't vectorize it. -; CHECK-NOT: x float> - define void @_Z4testmm(i64 %size, i64 %offset) { +; CHECK-LABEL: @_Z4testmm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP53:%.*]] = icmp eq i64 [[SIZE:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP76:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP77:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP120:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP121:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP164:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP165:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i64> [[TMP8]], +; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i64> [[TMP9]], +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> 
[[TMP10]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP11]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP11]], i32 3 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = load float, float* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load float, float* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load float, float* [[TMP17]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[TMP19]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP28]], i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP29]], i32 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP30]], i32 2 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i32 3 +; CHECK-NEXT: [[TMP36:%.*]] = load float, float* [[TMP21]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = load float, float* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = load float, float* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP39:%.*]] = load float, float* [[TMP27]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP36]], i32 0 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 1 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 2 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP39]], i32 3 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP44]], i32 0 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP47]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP44]], i32 4 +; CHECK-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, <4 x float>* [[TMP49]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = fmul fast <4 x float> [[TMP35]], [[WIDE_LOAD]] +; CHECK-NEXT: 
[[TMP51:%.*]] = fmul fast <4 x float> [[TMP43]], [[WIDE_LOAD9]] +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 0 +; CHECK-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP55]], align 4 +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 4 +; CHECK-NEXT: [[TMP57:%.*]] = bitcast float* [[TMP56]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, <4 x float>* [[TMP57]], align 4 +; CHECK-NEXT: [[TMP58:%.*]] = fmul fast <4 x float> [[TMP50]], [[WIDE_LOAD10]] +; CHECK-NEXT: [[TMP59:%.*]] = fmul fast <4 x float> [[TMP51]], [[WIDE_LOAD11]] +; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 0 +; CHECK-NEXT: [[TMP63:%.*]] = bitcast float* [[TMP62]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP63]], align 4 +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 4 +; CHECK-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4 +; CHECK-NEXT: [[TMP66:%.*]] = fmul fast <4 x float> [[TMP58]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP67:%.*]] = fmul fast <4 x float> [[TMP59]], [[WIDE_LOAD13]] +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 0 +; CHECK-NEXT: [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4 +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 4 +; CHECK-NEXT: [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = fmul fast <4 x float> [[TMP66]], [[WIDE_LOAD14]] +; CHECK-NEXT: [[TMP75:%.*]] = fmul fast <4 x float> [[TMP67]], [[WIDE_LOAD15]] +; CHECK-NEXT: [[TMP76]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP74]] +; CHECK-NEXT: [[TMP77]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP75]] +; CHECK-NEXT: [[TMP78:%.*]] = add <4 x i64> [[TMP10]], <i64 1, i64 1, i64 1, i64 1> +; CHECK-NEXT: [[TMP79:%.*]] = add <4 x i64> [[TMP11]], <i64 1, i64 1, i64 1, i64 1> +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i64> [[TMP78]], i32 0 +; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP80]] +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <4 x i64> [[TMP78]], i32 1 +; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP82]] +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i64> [[TMP78]], i32 2 +; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP84]] +; CHECK-NEXT: [[TMP86:%.*]] =
extractelement <4 x i64> [[TMP78]], i32 3 +; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP86]] +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <4 x i64> [[TMP79]], i32 0 +; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP88]] +; CHECK-NEXT: [[TMP90:%.*]] = extractelement <4 x i64> [[TMP79]], i32 1 +; CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP90]] +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i64> [[TMP79]], i32 2 +; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP92]] +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i64> [[TMP79]], i32 3 +; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = load float, float* [[TMP81]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = load float, float* [[TMP83]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = load float, float* [[TMP85]], align 4 +; CHECK-NEXT: [[TMP99:%.*]] = load float, float* [[TMP87]], align 4 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x float> poison, float [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x float> [[TMP100]], float [[TMP97]], i32 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x float> [[TMP101]], float [[TMP98]], i32 2 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x float> [[TMP102]], float [[TMP99]], i32 3 +; CHECK-NEXT: [[TMP104:%.*]] = load float, float* [[TMP89]], align 4 +; CHECK-NEXT: [[TMP105:%.*]] = load float, float* [[TMP91]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = load float, float* [[TMP93]], align 4 +; CHECK-NEXT: [[TMP107:%.*]] = load float, float* [[TMP95]], align 4 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x float> poison, float [[TMP104]], i32 0 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x float> [[TMP108]], float [[TMP105]], i32 1 +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x float> [[TMP109]], float [[TMP106]], i32 2 +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x float> [[TMP110]], float [[TMP107]], i32 3 +; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP103]] +; CHECK-NEXT: [[TMP113:%.*]] = fmul fast <4 x float> [[WIDE_LOAD9]], [[TMP111]] +; CHECK-NEXT: [[TMP114:%.*]] = fmul fast <4 x float> [[WIDE_LOAD10]], [[TMP112]] +; CHECK-NEXT: [[TMP115:%.*]] = fmul fast <4 x float> [[WIDE_LOAD11]], [[TMP113]] +; CHECK-NEXT: [[TMP116:%.*]] = fmul fast <4 x float> [[WIDE_LOAD12]], [[TMP114]] +; CHECK-NEXT: [[TMP117:%.*]] = fmul fast <4 x float> [[WIDE_LOAD13]], [[TMP115]] +; CHECK-NEXT: [[TMP118:%.*]] = fmul fast <4 x float> [[WIDE_LOAD14]], [[TMP116]] +; CHECK-NEXT: [[TMP119:%.*]] = fmul fast <4 x float> [[WIDE_LOAD15]], [[TMP117]] +; CHECK-NEXT: [[TMP120]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP118]] +; CHECK-NEXT: [[TMP121]] = fadd fast <4 x float> [[VEC_PHI3]], [[TMP119]] +; CHECK-NEXT: [[TMP122:%.*]] = add <4 x i64> [[TMP10]], <i64 2, i64 2, i64 2, i64 2> +; CHECK-NEXT: [[TMP123:%.*]] = add <4 x i64> [[TMP11]], <i64 2, i64 2, i64 2, i64 2> +; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i64> [[TMP122]], i32 0 +; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP124]] +; CHECK-NEXT: [[TMP126:%.*]] = extractelement <4 x i64> [[TMP122]], i32 1 +; CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP126]] +; CHECK-NEXT:
[[TMP128:%.*]] = extractelement <4 x i64> [[TMP122]], i32 2 +; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP128]] +; CHECK-NEXT: [[TMP130:%.*]] = extractelement <4 x i64> [[TMP122]], i32 3 +; CHECK-NEXT: [[TMP131:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP130]] +; CHECK-NEXT: [[TMP132:%.*]] = extractelement <4 x i64> [[TMP123]], i32 0 +; CHECK-NEXT: [[TMP133:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP132]] +; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i64> [[TMP123]], i32 1 +; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP134]] +; CHECK-NEXT: [[TMP136:%.*]] = extractelement <4 x i64> [[TMP123]], i32 2 +; CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP136]] +; CHECK-NEXT: [[TMP138:%.*]] = extractelement <4 x i64> [[TMP123]], i32 3 +; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP138]] +; CHECK-NEXT: [[TMP140:%.*]] = load float, float* [[TMP125]], align 4 +; CHECK-NEXT: [[TMP141:%.*]] = load float, float* [[TMP127]], align 4 +; CHECK-NEXT: [[TMP142:%.*]] = load float, float* [[TMP129]], align 4 +; CHECK-NEXT: [[TMP143:%.*]] = load float, float* [[TMP131]], align 4 +; CHECK-NEXT: [[TMP144:%.*]] = insertelement <4 x float> poison, float [[TMP140]], i32 0 +; CHECK-NEXT: [[TMP145:%.*]] = insertelement <4 x float> [[TMP144]], float [[TMP141]], i32 1 +; CHECK-NEXT: [[TMP146:%.*]] = insertelement <4 x float> [[TMP145]], float [[TMP142]], i32 2 +; CHECK-NEXT: [[TMP147:%.*]] = insertelement <4 x float> [[TMP146]], float [[TMP143]], i32 3 +; CHECK-NEXT: [[TMP148:%.*]] = load float, float* [[TMP133]], align 4 +; CHECK-NEXT: [[TMP149:%.*]] = load float, float* [[TMP135]], align 4 +; CHECK-NEXT: [[TMP150:%.*]] = load float, float* [[TMP137]], align 4 +; CHECK-NEXT: [[TMP151:%.*]] = load float, float* [[TMP139]], align 4 +; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x float> poison, float [[TMP148]], i32 0 +; CHECK-NEXT: [[TMP153:%.*]] = insertelement <4 x float> [[TMP152]], float [[TMP149]], i32 1 +; CHECK-NEXT: [[TMP154:%.*]] = insertelement <4 x float> [[TMP153]], float [[TMP150]], i32 2 +; CHECK-NEXT: [[TMP155:%.*]] = insertelement <4 x float> [[TMP154]], float [[TMP151]], i32 3 +; CHECK-NEXT: [[TMP156:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP147]] +; CHECK-NEXT: [[TMP157:%.*]] = fmul fast <4 x float> [[WIDE_LOAD9]], [[TMP155]] +; CHECK-NEXT: [[TMP158:%.*]] = fmul fast <4 x float> [[WIDE_LOAD10]], [[TMP156]] +; CHECK-NEXT: [[TMP159:%.*]] = fmul fast <4 x float> [[WIDE_LOAD11]], [[TMP157]] +; CHECK-NEXT: [[TMP160:%.*]] = fmul fast <4 x float> [[WIDE_LOAD12]], [[TMP158]] +; CHECK-NEXT: [[TMP161:%.*]] = fmul fast <4 x float> [[WIDE_LOAD13]], [[TMP159]] +; CHECK-NEXT: [[TMP162:%.*]] = fmul fast <4 x float> [[WIDE_LOAD14]], [[TMP160]] +; CHECK-NEXT: [[TMP163:%.*]] = fmul fast <4 x float> [[WIDE_LOAD15]], [[TMP161]] +; CHECK-NEXT: [[TMP164]] = fadd fast <4 x float> [[VEC_PHI5]], [[TMP162]] +; CHECK-NEXT: [[TMP165]] = fadd fast <4 x float> [[VEC_PHI6]], [[TMP163]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[TMP166:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP166]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop
[[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX18:%.*]] = fadd fast <4 x float> [[TMP165]], [[TMP164]] +; CHECK-NEXT: [[TMP167:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX18]]) +; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[TMP121]], [[TMP120]] +; CHECK-NEXT: [[TMP168:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX16]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP77]], [[TMP76]] +; CHECK-NEXT: [[TMP169:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP169]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP168]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX19:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP167]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[R_057:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[G_056:%.*]] = phi float [ [[BC_MERGE_RDX17]], [[SCALAR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[V_055:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[B_054:%.*]] = phi float [ [[BC_MERGE_RDX19]], [[SCALAR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[V_055]], [[OFFSET]] +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[MUL]] +; CHECK-NEXT: [[TMP170:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP171:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP170]], [[TMP171]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP172:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP172]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP173:%.*]] = load float, float* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP173]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[V_055]] +; CHECK-NEXT: [[TMP174:%.*]] = load float, float* [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP174]] +; CHECK-NEXT: [[ADD10]] = fadd fast float [[R_057]], [[MUL9]] +; CHECK-NEXT: [[ARRAYIDX_SUM:%.*]] = add i64 [[MUL]], 1 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM]] +; CHECK-NEXT: [[TMP175:%.*]] = load float, float* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP171]], [[TMP175]] +; CHECK-NEXT: 
[[MUL15:%.*]] = fmul fast float [[TMP172]], [[MUL13]] +; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP173]], [[MUL15]] +; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP174]], [[MUL17]] +; CHECK-NEXT: [[ADD20]] = fadd fast float [[G_056]], [[MUL19]] +; CHECK-NEXT: [[ARRAYIDX_SUM52:%.*]] = add i64 [[MUL]], 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM52]] +; CHECK-NEXT: [[TMP176:%.*]] = load float, float* [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP171]], [[TMP176]] +; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP172]], [[MUL23]] +; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP173]], [[MUL25]] +; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP174]], [[MUL27]] +; CHECK-NEXT: [[ADD30]] = fadd fast float [[B_054]], [[MUL29]] +; CHECK-NEXT: [[INC]] = add i64 [[V_055]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INC]], [[SIZE]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ], [ [[TMP167]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ], [ [[TMP168]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ], [ [[TMP169]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8 +; CHECK-NEXT: [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8 +; CHECK-NEXT: [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8 +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: store i8 [[R_0_LCSSA]], i8* @r_, align 1 +; CHECK-NEXT: store i8 [[G_0_LCSSA]], i8* @g_, align 1 +; CHECK-NEXT: store i8 [[B_0_LCSSA]], i8* @b_, align 1 +; CHECK-NEXT: ret void +; entry: %cmp53 = icmp eq i64 %size, 0 br i1 %cmp53, label %for.end, label %for.body.lr.ph diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" @@ -11,14 +12,187 @@ @g_ = global i8 0, align 4 @b_ = global i8 0, align 4 -; We don't want to vectorize most loops containing gathers because they are -; expensive. This function represents a point where vectorization starts to -; become beneficial. -; Make sure we are conservative and don't vectorize it. 
-; CHECK-NOT: <2 x float> -; CHECK-NOT: <4 x float> - define void @_Z4testmm(i32 %size, i32 %offset) { +; CHECK-LABEL: @_Z4testmm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP53:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[OFFSET:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP60:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], <i32 3, i32 3, i32 3, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x
float>* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = fmul fast <4 x float> [[TMP21]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP26]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP28]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <4 x float> [[TMP25]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP30]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast float* [[TMP31]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[TMP29]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP35]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* [[TMP36]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = fmul fast <4 x float> [[TMP33]], [[WIDE_LOAD5]] +; CHECK-NEXT: [[TMP38]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP5]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP40]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = load float, float* [[TMP41]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, float* [[TMP43]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, float* [[TMP45]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, float* [[TMP47]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> poison, float [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x float> [[TMP52]], float [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x float> [[TMP53]], float [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x float> [[TMP54]], float [[TMP51]], i32 3 +; CHECK-NEXT: [[TMP56:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP58]] +; CHECK-NEXT: [[TMP60]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add <4 x i32> [[TMP5]], <i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[TMP61]], i32 0 +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data,
i32 0, i32 [[TMP62]] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i32> [[TMP61]], i32 1 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[TMP61]], i32 2 +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP66]] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i32> [[TMP61]], i32 3 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = load float, float* [[TMP63]], align 4 +; CHECK-NEXT: [[TMP71:%.*]] = load float, float* [[TMP65]], align 4 +; CHECK-NEXT: [[TMP72:%.*]] = load float, float* [[TMP67]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = load float, float* [[TMP69]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <4 x float> poison, float [[TMP70]], i32 0 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <4 x float> [[TMP74]], float [[TMP71]], i32 1 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x float> [[TMP75]], float [[TMP72]], i32 2 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x float> [[TMP76]], float [[TMP73]], i32 3 +; CHECK-NEXT: [[TMP78:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP78]] +; CHECK-NEXT: [[TMP80:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP79]] +; CHECK-NEXT: [[TMP81:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP80]] +; CHECK-NEXT: [[TMP82]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP81]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP82]]) +; CHECK-NEXT: [[TMP85:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP60]]) +; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP38]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP84]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[R_057:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[G_056:%.*]] = phi float [ [[BC_MERGE_RDX6]], [[SCALAR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[V_055:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[B_054:%.*]] = phi float [ [[BC_MERGE_RDX7]], [[SCALAR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[V_055]], [[OFFSET]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[ADD]], 3
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[MUL]] +; CHECK-NEXT: [[TMP87:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 [[V_055]] +; CHECK-NEXT: [[TMP88:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP87]], [[TMP88]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 [[V_055]] +; CHECK-NEXT: [[TMP89:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP89]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 [[V_055]] +; CHECK-NEXT: [[TMP90:%.*]] = load float, float* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP90]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 [[V_055]] +; CHECK-NEXT: [[TMP91:%.*]] = load float, float* [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP91]] +; CHECK-NEXT: [[ADD10]] = fadd fast float [[R_057]], [[MUL9]] +; CHECK-NEXT: [[ARRAYIDX_SUM:%.*]] = add i32 [[MUL]], 1 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[ARRAYIDX_SUM]] +; CHECK-NEXT: [[TMP92:%.*]] = load float, float* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP88]], [[TMP92]] +; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP89]], [[MUL13]] +; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP90]], [[MUL15]] +; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP91]], [[MUL17]] +; CHECK-NEXT: [[ADD20]] = fadd fast float [[G_056]], [[MUL19]] +; CHECK-NEXT: [[ARRAYIDX_SUM52:%.*]] = add i32 [[MUL]], 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[ARRAYIDX_SUM52]] +; CHECK-NEXT: [[TMP93:%.*]] = load float, float* [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP88]], [[TMP93]] +; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP89]], [[MUL23]] +; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP90]], [[MUL25]] +; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP91]], [[MUL27]] +; CHECK-NEXT: [[ADD30]] = fadd fast float [[B_054]], [[MUL29]] +; CHECK-NEXT: [[INC]] = add i32 [[V_055]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[SIZE]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ], [ [[TMP84]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8 +; CHECK-NEXT: [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8 +; CHECK-NEXT: [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8 +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, 
[[ENTRY]] ] +; CHECK-NEXT: [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: store i8 [[R_0_LCSSA]], i8* @r_, align 4 +; CHECK-NEXT: store i8 [[G_0_LCSSA]], i8* @g_, align 4 +; CHECK-NEXT: store i8 [[B_0_LCSSA]], i8* @b_, align 4 +; CHECK-NEXT: ret void +; entry: %cmp53 = icmp eq i32 %size, 0 br i1 %cmp53, label %for.end, label %for.body.lr.ph diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll @@ -11,10 +11,7 @@ @g_ = global i8 0, align 1 @b_ = global i8 0, align 1 -; We don't want to vectorize most loops containing gathers because they are -; expensive. This function represents a point where vectorization starts to -; become beneficial. -; Make sure we are conservative and don't vectorize it. +; This function represents a point where vectorization starts to become beneficial. define void @_Z4testmm(i64 %size, i64 %offset) { ; CHECK-LABEL: @_Z4testmm( @@ -22,52 +19,168 @@ ; CHECK-NEXT: [[CMP53:%.*]] = icmp eq i64 [[SIZE:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]] ; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP60:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP4]], <i64 3, i64 3, i64 3, i64 3> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0,
i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP24]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = fmul fast <4 x float> [[TMP21]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP26]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP28]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <4 x float> [[TMP25]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP30]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast float* [[TMP31]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP32]], align 4 +; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <4 x float> [[TMP29]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP35]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* [[TMP36]], align 4 +; CHECK-NEXT: [[TMP37:%.*]] = fmul fast <4 x float> [[TMP33]], [[WIDE_LOAD5]] +; CHECK-NEXT: [[TMP38]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i64> [[TMP5]], <i64 1, i64 1, i64 1, i64 1> +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i64> [[TMP39]], i32 0 +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP40]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i64> [[TMP39]], i32 1 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i64> [[TMP39]], i32 2 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i64> [[TMP39]], i32 3 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = load float, float* [[TMP41]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, float* [[TMP43]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load float, float* [[TMP45]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load float, float* [[TMP47]], align
4 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> poison, float [[TMP48]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <4 x float> [[TMP52]], float [[TMP49]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x float> [[TMP53]], float [[TMP50]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x float> [[TMP54]], float [[TMP51]], i32 3 +; CHECK-NEXT: [[TMP56:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP58]] +; CHECK-NEXT: [[TMP60]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP59]] +; CHECK-NEXT: [[TMP61:%.*]] = add <4 x i64> [[TMP5]], <i64 2, i64 2, i64 2, i64 2> +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i64> [[TMP61]], i32 0 +; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP62]] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i64> [[TMP61]], i32 1 +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i64> [[TMP61]], i32 2 +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP66]] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i64> [[TMP61]], i32 3 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = load float, float* [[TMP63]], align 4 +; CHECK-NEXT: [[TMP71:%.*]] = load float, float* [[TMP65]], align 4 +; CHECK-NEXT: [[TMP72:%.*]] = load float, float* [[TMP67]], align 4 +; CHECK-NEXT: [[TMP73:%.*]] = load float, float* [[TMP69]], align 4 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <4 x float> poison, float [[TMP70]], i32 0 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <4 x float> [[TMP74]], float [[TMP71]], i32 1 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x float> [[TMP75]], float [[TMP72]], i32 2 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x float> [[TMP76]], float [[TMP73]], i32 3 +; CHECK-NEXT: [[TMP78:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP77]] +; CHECK-NEXT: [[TMP79:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP78]] +; CHECK-NEXT: [[TMP80:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP79]] +; CHECK-NEXT: [[TMP81:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP80]] +; CHECK-NEXT: [[TMP82]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP81]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[TMP83:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP83]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP82]]) +; CHECK-NEXT: [[TMP85:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP60]]) +; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP38]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0,
[[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[TMP84]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[R_057:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[G_056:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[V_055:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[B_054:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[V_055]], [[OFFSET:%.*]] +; CHECK-NEXT: [[R_057:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[G_056:%.*]] = phi float [ [[BC_MERGE_RDX6]], [[SCALAR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[V_055:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[B_054:%.*]] = phi float [ [[BC_MERGE_RDX7]], [[SCALAR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[V_055]], [[OFFSET]] ; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[MUL]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP87:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[V_055]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP88:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP87]], [[TMP88]] ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[V_055]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]] +; CHECK-NEXT: [[TMP89:%.*]] = load float, float* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP89]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[V_055]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]] +; CHECK-NEXT: [[TMP90:%.*]] = load float, float* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP90]] ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[V_055]] -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX8]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]] +; CHECK-NEXT: [[TMP91:%.*]] = load float, float* [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP91]] ; CHECK-NEXT: [[ADD10]] = fadd fast float [[R_057]], [[MUL9]] ; CHECK-NEXT: [[ARRAYIDX_SUM:%.*]] = add i64 [[MUL]], 1 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 
[[ARRAYIDX_SUM]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX11]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]] -; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]] -; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]] -; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]] +; CHECK-NEXT: [[TMP92:%.*]] = load float, float* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP88]], [[TMP92]] +; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP89]], [[MUL13]] +; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP90]], [[MUL15]] +; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP91]], [[MUL17]] ; CHECK-NEXT: [[ADD20]] = fadd fast float [[G_056]], [[MUL19]] ; CHECK-NEXT: [[ARRAYIDX_SUM52:%.*]] = add i64 [[MUL]], 2 ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM52]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX21]], align 4 -; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]] -; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]] -; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]] -; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]] +; CHECK-NEXT: [[TMP93:%.*]] = load float, float* [[ARRAYIDX21]], align 4 +; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP88]], [[TMP93]] +; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP89]], [[MUL23]] +; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP90]], [[MUL25]] +; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP91]], [[MUL27]] ; CHECK-NEXT: [[ADD30]] = fadd fast float [[B_054]], [[MUL29]] ; CHECK-NEXT: [[INC]] = add i64 [[V_055]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INC]], [[SIZE]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: for.cond.for.end_crit_edge: -; CHECK-NEXT: [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ], [ [[TMP84]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8 ; CHECK-NEXT: [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8 ; CHECK-NEXT: [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -1429,29 +1429,169 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = or <8 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP9]] +;
ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i8> [[TMP7]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.store.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if2: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP13]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i8> [[TMP7]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE3]] +; ENABLED_MASKED_STRIDED: pred.store.continue3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = extractelement <8 x i8> [[TMP7]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP19]], i8* [[TMP18]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE5]] +; ENABLED_MASKED_STRIDED: pred.store.continue5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP21]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i8> [[TMP7]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP23]], i8* [[TMP22]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE7]] +; ENABLED_MASKED_STRIDED: pred.store.continue7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP25]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <8 x i8> [[TMP7]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP27]], i8* [[TMP26]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE9]] +; ENABLED_MASKED_STRIDED: pred.store.continue9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 
[[TMP29]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP7]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP31]], i8* [[TMP30]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE11]] +; ENABLED_MASKED_STRIDED: pred.store.continue11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP32]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP33]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP7]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP35]], i8* [[TMP34]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE13]] +; ENABLED_MASKED_STRIDED: pred.store.continue13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP37]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = extractelement <8 x i8> [[TMP7]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP39]], i8* [[TMP38]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE15]] +; ENABLED_MASKED_STRIDED: pred.store.continue15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = sub <8 x i8> zeroinitializer, [[TMP7]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if16: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP42]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = extractelement <8 x i8> [[TMP40]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP44]], i8* [[TMP43]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE17]] +; ENABLED_MASKED_STRIDED: pred.store.continue17: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if18: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i32> [[TMP5]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP46]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = extractelement <8 x i8> [[TMP40]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP48]], i8* [[TMP47]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE19]] +; ENABLED_MASKED_STRIDED: pred.store.continue19: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if20: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = 
extractelement <8 x i32> [[TMP5]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP50]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = extractelement <8 x i8> [[TMP40]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP52]], i8* [[TMP51]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE21]] +; ENABLED_MASKED_STRIDED: pred.store.continue21: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if22: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP54:%.*]] = extractelement <8 x i32> [[TMP5]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP54]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP56:%.*]] = extractelement <8 x i8> [[TMP40]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP56]], i8* [[TMP55]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE23]] +; ENABLED_MASKED_STRIDED: pred.store.continue23: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP57:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP57]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if24: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP58:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP58]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP60:%.*]] = extractelement <8 x i8> [[TMP40]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP60]], i8* [[TMP59]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE25]] +; ENABLED_MASKED_STRIDED: pred.store.continue25: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP61:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP61]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if26: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP62:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP62]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP64:%.*]] = extractelement <8 x i8> [[TMP40]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP64]], i8* [[TMP63]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE27]] +; ENABLED_MASKED_STRIDED: pred.store.continue27: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP65:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP65]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if28: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP66:%.*]] = extractelement <8 x i32> [[TMP5]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP66]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP68:%.*]] = extractelement <8 x i8> [[TMP40]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP68]], i8* [[TMP67]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE29]] +; ENABLED_MASKED_STRIDED: pred.store.continue29: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP69:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]] +; ENABLED_MASKED_STRIDED: pred.store.if30: +; 
ENABLED_MASKED_STRIDED-NEXT: [[TMP70:%.*]] = extractelement <8 x i32> [[TMP5]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP71:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP70]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP72:%.*]] = extractelement <8 x i8> [[TMP40]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP72]], i8* [[TMP71]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE31]] +; ENABLED_MASKED_STRIDED: pred.store.continue31: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP73:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP73]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void ; @@ -1865,31 +2005,171 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE33:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE33]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> poison, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) ; 
ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or i32 [[TMP2]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = or <8 x i32> [[TMP2]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP5]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP2]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP11]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <8 x i8> [[TMP9]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP13]], i8* [[TMP12]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.store.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP5]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP15]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i8> [[TMP9]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP17]], i8* [[TMP16]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE5]] +; ENABLED_MASKED_STRIDED: pred.store.continue5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = extractelement <8 x i1> [[TMP5]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP19]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = extractelement <8 x i8> [[TMP9]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP21]], i8* [[TMP20]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label 
[[PRED_STORE_CONTINUE7]] +; ENABLED_MASKED_STRIDED: pred.store.continue7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP5]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP23]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = extractelement <8 x i8> [[TMP9]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP25]], i8* [[TMP24]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE9]] +; ENABLED_MASKED_STRIDED: pred.store.continue9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP5]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP27]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP9]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP29]], i8* [[TMP28]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE11]] +; ENABLED_MASKED_STRIDED: pred.store.continue11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = extractelement <8 x i1> [[TMP5]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP31]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP9]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP33]], i8* [[TMP32]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE13]] +; ENABLED_MASKED_STRIDED: pred.store.continue13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP5]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP2]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP35]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = extractelement <8 x i8> [[TMP9]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP37]], i8* [[TMP36]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE15]] +; ENABLED_MASKED_STRIDED: pred.store.continue15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP5]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP38]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if16: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[TMP2]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP39]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i8> [[TMP9]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP41]], i8* [[TMP40]], align 1 +; 
ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE17]] +; ENABLED_MASKED_STRIDED: pred.store.continue17: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = sub <8 x i8> zeroinitializer, [[TMP9]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = extractelement <8 x i1> [[TMP5]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if18: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = extractelement <8 x i32> [[TMP7]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP44]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i8> [[TMP42]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP46]], i8* [[TMP45]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE19]] +; ENABLED_MASKED_STRIDED: pred.store.continue19: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i1> [[TMP5]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP47]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if20: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = extractelement <8 x i32> [[TMP7]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP48]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = extractelement <8 x i8> [[TMP42]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP50]], i8* [[TMP49]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE21]] +; ENABLED_MASKED_STRIDED: pred.store.continue21: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = extractelement <8 x i1> [[TMP5]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if22: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = extractelement <8 x i32> [[TMP7]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP52]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP54:%.*]] = extractelement <8 x i8> [[TMP42]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP54]], i8* [[TMP53]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE23]] +; ENABLED_MASKED_STRIDED: pred.store.continue23: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP55:%.*]] = extractelement <8 x i1> [[TMP5]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP55]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if24: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP56:%.*]] = extractelement <8 x i32> [[TMP7]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP56]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP58:%.*]] = extractelement <8 x i8> [[TMP42]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP58]], i8* [[TMP57]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE25]] +; ENABLED_MASKED_STRIDED: pred.store.continue25: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP59:%.*]] = extractelement <8 x i1> [[TMP5]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP59]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if26: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP60:%.*]] = extractelement <8 x i32> [[TMP7]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP60]] +; ENABLED_MASKED_STRIDED-NEXT: 
[[TMP62:%.*]] = extractelement <8 x i8> [[TMP42]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP62]], i8* [[TMP61]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE27]] +; ENABLED_MASKED_STRIDED: pred.store.continue27: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP63:%.*]] = extractelement <8 x i1> [[TMP5]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP63]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if28: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP64:%.*]] = extractelement <8 x i32> [[TMP7]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP64]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP66:%.*]] = extractelement <8 x i8> [[TMP42]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP66]], i8* [[TMP65]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE29]] +; ENABLED_MASKED_STRIDED: pred.store.continue29: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP67:%.*]] = extractelement <8 x i1> [[TMP5]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP67]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if30: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP68:%.*]] = extractelement <8 x i32> [[TMP7]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP69:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP68]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP70:%.*]] = extractelement <8 x i8> [[TMP42]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP70]], i8* [[TMP69]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE31]] +; ENABLED_MASKED_STRIDED: pred.store.continue31: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP71:%.*]] = extractelement <8 x i1> [[TMP5]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33]] +; ENABLED_MASKED_STRIDED: pred.store.if32: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP72:%.*]] = extractelement <8 x i32> [[TMP7]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP72]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP74:%.*]] = extractelement <8 x i8> [[TMP42]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP74]], i8* [[TMP73]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE33]] +; ENABLED_MASKED_STRIDED: pred.store.continue33: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP13]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP75:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP75]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void ; @@ -2303,30 +2583,169 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> poison, i32 [[INDEX]], i32 0 -; 
ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = or <8 x i32> [[BROADCAST_SPLAT2]], -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = or <8 x i32> [[TMP1]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 
[[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP9]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i8> [[TMP7]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP11]], i8* [[TMP10]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.store.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if2: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP13]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i8> [[TMP7]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE3]] +; ENABLED_MASKED_STRIDED: pred.store.continue3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = extractelement <8 x i8> [[TMP7]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP19]], i8* [[TMP18]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE5]] +; ENABLED_MASKED_STRIDED: pred.store.continue5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP21]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i8> [[TMP7]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP23]], i8* [[TMP22]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE7]] +; ENABLED_MASKED_STRIDED: pred.store.continue7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP25]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <8 x i8> [[TMP7]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP27]], i8* [[TMP26]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE9]] +; ENABLED_MASKED_STRIDED: pred.store.continue9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: br 
i1 [[TMP28]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP29]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP7]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP31]], i8* [[TMP30]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE11]] +; ENABLED_MASKED_STRIDED: pred.store.continue11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP32]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP33]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP7]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP35]], i8* [[TMP34]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE13]] +; ENABLED_MASKED_STRIDED: pred.store.continue13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP37]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = extractelement <8 x i8> [[TMP7]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP39]], i8* [[TMP38]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE15]] +; ENABLED_MASKED_STRIDED: pred.store.continue15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = sub <8 x i8> zeroinitializer, [[TMP7]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP41]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if16: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP42]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = extractelement <8 x i8> [[TMP40]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP44]], i8* [[TMP43]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE17]] +; ENABLED_MASKED_STRIDED: pred.store.continue17: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP45]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if18: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i32> [[TMP5]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP46]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = extractelement <8 x i8> [[TMP40]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP48]], i8* [[TMP47]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE19]] +; ENABLED_MASKED_STRIDED: pred.store.continue19: +; 
ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP49]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if20: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = extractelement <8 x i32> [[TMP5]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP50]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = extractelement <8 x i8> [[TMP40]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP52]], i8* [[TMP51]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE21]] +; ENABLED_MASKED_STRIDED: pred.store.continue21: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP53]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if22: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP54:%.*]] = extractelement <8 x i32> [[TMP5]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP54]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP56:%.*]] = extractelement <8 x i8> [[TMP40]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP56]], i8* [[TMP55]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE23]] +; ENABLED_MASKED_STRIDED: pred.store.continue23: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP57:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP57]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if24: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP58:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP58]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP60:%.*]] = extractelement <8 x i8> [[TMP40]], i32 4 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP60]], i8* [[TMP59]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE25]] +; ENABLED_MASKED_STRIDED: pred.store.continue25: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP61:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP61]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if26: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP62:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP62]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP64:%.*]] = extractelement <8 x i8> [[TMP40]], i32 5 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP64]], i8* [[TMP63]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE27]] +; ENABLED_MASKED_STRIDED: pred.store.continue27: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP65:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP65]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if28: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP66:%.*]] = extractelement <8 x i32> [[TMP5]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP66]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP68:%.*]] = extractelement <8 x i8> [[TMP40]], i32 6 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP68]], i8* [[TMP67]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE29]] +; 
ENABLED_MASKED_STRIDED: pred.store.continue29: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP69:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]] +; ENABLED_MASKED_STRIDED: pred.store.if30: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP70:%.*]] = extractelement <8 x i32> [[TMP5]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP71:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP70]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP72:%.*]] = extractelement <8 x i8> [[TMP40]], i32 7 +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP72]], i8* [[TMP71]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE31]] +; ENABLED_MASKED_STRIDED: pred.store.continue31: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP73:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP73]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -246,31 +246,95 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1 ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 -1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT2]], -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP3]], i32 2, <4 x i1> [[TMP1]], <4 x i16> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = shl nsw i64 [[INDEX]], 2 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i16* [[TMP5]] to 
<4 x i16>* -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP6]], i32 2, <4 x i1> [[TMP1]], <4 x i16> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = or i64 [[TMP4]], 1 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP0]], i64 [[TMP7]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <16 x i16>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_MASKED_LOAD]], <4 x i16> [[WIDE_MASKED_LOAD3]], <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> [[INTERLEAVED_VEC]], <16 x i16>* [[TMP9]], i32 2, <16 x i1> [[TMP10]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[FOR_BODY_PREHEADER]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE15]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP2]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP7]], i16* [[TMP6]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.store.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if1: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], i16* [[TMP10]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE2]] +; ENABLED_MASKED_STRIDED: pred.store.continue2: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], 
i32 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP13]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP15]], i16* [[TMP14]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE4]] +; ENABLED_MASKED_STRIDED: pred.store.continue4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP19]], i16* [[TMP18]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.store.continue6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = bitcast i16* [[TMP20]] to <4 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP21]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = or <4 x i64> [[TMP3]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP24]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP26]], i16* [[TMP25]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE9]] +; ENABLED_MASKED_STRIDED: pred.store.continue9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP28]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP30]], i16* [[TMP29]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE11]] +; ENABLED_MASKED_STRIDED: pred.store.continue11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP32]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = 
extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP34]], i16* [[TMP33]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE13]] +; ENABLED_MASKED_STRIDED: pred.store.continue13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]] +; ENABLED_MASKED_STRIDED: pred.store.if14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP36]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP38]], i16* [[TMP37]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE15]] +; ENABLED_MASKED_STRIDED: pred.store.continue15: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]], label [[FOR_END_LOOPEXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP39]], label [[FOR_END_LOOPEXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end.loopexit: ; ENABLED_MASKED_STRIDED-NEXT: br label [[FOR_END]] ; ENABLED_MASKED_STRIDED: for.end: @@ -377,21 +441,53 @@ ; ENABLED_MASKED_STRIDED-NEXT: entry: ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nuw nsw i64 [[INDEX]], 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <12 x i16>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> poison, <12 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <12 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <12 x i1> [[INTERLEAVED_MASK]], -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v12i16.p0v12i16(<12 x i16> [[INTERLEAVED_VEC]], <12 x i16>* [[TMP5]], i32 2, <12 x i1> [[TMP6]]) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nuw nsw <4 x i64> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 +; 
ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP7]], i16* [[TMP6]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.store.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if1: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 1 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], i16* [[TMP10]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE2]] +; ENABLED_MASKED_STRIDED: pred.store.continue2: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; ENABLED_MASKED_STRIDED: pred.store.if3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP13]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP15]], i16* [[TMP14]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE4]] +; ENABLED_MASKED_STRIDED: pred.store.continue4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.store.if5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; ENABLED_MASKED_STRIDED-NEXT: store i16 [[TMP19]], i16* [[TMP18]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.store.continue6: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP20]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void ;