diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6657,6 +6657,44 @@ return Changed; } +/// Order may have elements assigned special value (size) which is out of +/// bounds. Such indices only appear on places which correspond to undef values +/// (see canReuseExtract for details) and used in order to avoid undef values +/// have effect on operands ordering. +/// The first loop below simply finds all unused indices and then the next loop +/// nest assigns these indices for undef values positions. +/// As an example below Order has two undef positions and they have assigned +/// values 3 and 7 respectively: +/// before: 6 9 5 4 9 2 1 0 +/// after: 6 3 5 4 7 2 1 0 +/// \returns Fixed ordering. +static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef<unsigned> Order) { + BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end()); + const unsigned Sz = NewOrder.size(); + SmallBitVector UsedIndices(Sz); + SmallVector<int> MaskedIndices; + for (int I = 0, E = NewOrder.size(); I < E; ++I) { + if (NewOrder[I] < Sz) + UsedIndices.set(NewOrder[I]); + else + MaskedIndices.push_back(I); + } + if (MaskedIndices.empty()) + return NewOrder; + SmallVector<int> AvailableIndices(MaskedIndices.size()); + unsigned Cnt = 0; + int Idx = UsedIndices.find_first_unset(); + do { + AvailableIndices[Cnt] = Idx; + Idx = UsedIndices.find_next_unset(Idx); + ++Cnt; + } while (Idx > 0); + assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); + for (int I = 0, E = MaskedIndices.size(); I < E; ++I) + NewOrder[MaskedIndices[I]] = AvailableIndices[I]; + return NewOrder; +} + bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, unsigned Idx) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -6676,9 +6714,9 @@ // TODO: Handle orders of size less than number of elements in the vector. 
if (Order && Order->size() == Chain.size()) { // TODO: reorder tree nodes without tree rebuilding. - SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend()); - llvm::transform(*Order, ReorderedOps.begin(), - [Chain](const unsigned Idx) { return Chain[Idx]; }); + SmallVector<Value *> ReorderedOps(Chain.size()); + transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), + [Chain](const unsigned Idx) { return Chain[Idx]; }); R.buildTree(ReorderedOps); } if (R.isTreeTinyAndNotFullyVectorizable()) @@ -6952,7 +6990,10 @@ else OpsWidth = VF; - if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) + if (!isPowerOf2_32(OpsWidth)) + continue; + + if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) break; ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); @@ -6967,17 +7008,15 @@ << "\n"); R.buildTree(Ops); - Optional<ArrayRef<unsigned>> Order = R.bestOrder(); - // TODO: check if we can allow reordering for more cases. - if (AllowReorder && Order) { - // TODO: reorder tree nodes without tree rebuilding. - // Conceptually, there is nothing actually preventing us from trying to - // reorder a larger list. In fact, we do exactly this when vectorizing - // reductions. However, at this point, we only expect to get here when - // there are exactly two operations. - assert(Ops.size() == 2); - Value *ReorderedOps[] = {Ops[1], Ops[0]}; - R.buildTree(ReorderedOps, None); + if (AllowReorder) { + Optional<ArrayRef<unsigned>> Order = R.bestOrder(); + if (Order) { + // TODO: reorder tree nodes without tree rebuilding. + SmallVector<Value *> ReorderedOps(Ops.size()); + transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), + [Ops](const unsigned Idx) { return Ops[Idx]; }); + R.buildTree(ReorderedOps); + } } if (R.isTreeTinyAndNotFullyVectorizable()) continue; @@ -7600,8 +7639,8 @@ "instructions."); // TODO: reorder tree nodes without tree rebuilding. 
SmallVector<Value *, 4> ReorderedOps(VL.size()); - llvm::transform(*Order, ReorderedOps.begin(), - [VL](const unsigned Idx) { return VL[Idx]; }); + transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), + [VL](const unsigned Idx) { return VL[Idx]; }); V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); } if (V.isTreeTinyAndNotFullyVectorizable()) @@ -8169,9 +8208,8 @@ // So allow tryToVectorizeList to reorder them if it is beneficial. This // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. - bool AllowReorder = NumElts == 2; - if (NumElts > 1 && - tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) { + if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, + /*AllowReorder=*/true)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -173,10 +173,8 @@ ; MAX-COST-NEXT: entry: ; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 ; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 -; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3) to <2 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP3:%.*]] = 
icmp eq <2 x i8> [[TMP2]], zeroinitializer ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -188,19 +186,21 @@ ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> undef, <4 x i32> -; MAX-COST-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> poison, <4 x i1> [[TMP2]], <4 x i32> -; MAX-COST-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP4]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> -; MAX-COST-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; MAX-COST-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> undef, <4 x i32> +; MAX-COST-NEXT: [[TMP6:%.*]] = shufflevector <4 x i1> poison, <4 x i1> [[TMP5]], <4 x i32> +; MAX-COST-NEXT: [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> undef, <4 x i32> +; MAX-COST-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> [[TMP7]], <4 x i32> +; MAX-COST-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; MAX-COST-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = 
call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; MAX-COST-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[P27]] +; MAX-COST-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP15]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll @@ -1,38 +1,76 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s 
--check-prefixes=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=AVX define <8 x float> @ceil_floor(<8 x float> %a) { -; CHECK-LABEL: @ceil_floor( -; CHECK-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; CHECK-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; CHECK-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; CHECK-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]]) -; CHECK-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]]) -; CHECK-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; CHECK-NEXT: [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]]) -; CHECK-NEXT: [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]]) -; CHECK-NEXT: [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]]) -; CHECK-NEXT: [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]]) -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; CHECK-NEXT: 
[[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; SSE-LABEL: @ceil_floor( +; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; SSE-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]]) +; SSE-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]]) +; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]]) +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]]) +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <8 x i32> +; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> undef, <8 x i32> +; SSE-NEXT: [[R71:%.*]] = shufflevector <8 
x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[R71]] +; +; SLM-LABEL: @ceil_floor( +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <8 x i32> +; SLM-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> undef, <8 x i32> +; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> undef, <8 x i32> +; SLM-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[R71]] +; +; AVX-LABEL: @ceil_floor( +; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; 
AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <8 x i32> +; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> undef, <8 x i32> +; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> undef, <8 x i32> +; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[R71]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll @@ -1,38 +1,76 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine 
-S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=AVX define <8 x float> @ceil_floor(<8 x float> %a) { -; CHECK-LABEL: @ceil_floor( -; CHECK-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 -; CHECK-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; CHECK-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; CHECK-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]]) -; CHECK-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]]) -; CHECK-NEXT: [[AB3:%.*]] = call float 
@llvm.ceil.f32(float [[A3]]) -; CHECK-NEXT: [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]]) -; CHECK-NEXT: [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]]) -; CHECK-NEXT: [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]]) -; CHECK-NEXT: [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]]) -; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R7]] +; SSE-LABEL: @ceil_floor( +; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SSE-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; SSE-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]]) +; SSE-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]]) +; SSE-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]]) +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]]) +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 
0 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <8 x i32> +; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> undef, <8 x i32> +; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[R71]] +; +; SLM-LABEL: @ceil_floor( +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 +; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <8 x i32> +; SLM-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> undef, <8 x i32> +; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> 
[[TMP8]], <8 x i32> +; SLM-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> undef, <8 x i32> +; SLM-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[R71]] +; +; AVX-LABEL: @ceil_floor( +; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <8 x i32> +; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> undef, <8 x i32> +; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> undef, <8 x i32> +; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[R71]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -236,33 +236,29 @@ ; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 ; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 ; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 ; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 ; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] ; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] ; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] ; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] ; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] -; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] +; SSE-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <2 x i32> ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; 
SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; SSE-NEXT: ret <8 x i32> [[R7]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <8 x i32> +; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP3]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[R71]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> @@ -388,26 +384,102 @@ } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @sdiv_v8i32_undefs( +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> 
[[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; SLM-LABEL: @sdiv_v8i32_undefs( +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SLM-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SLM-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SLM-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SLM-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SLM-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = 
insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SLM-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @sdiv_v8i32_undefs( +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; AVX1-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @sdiv_v8i32_undefs( +; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> 
[[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i32 5 +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[R71]] +; +; AVX512-LABEL: @sdiv_v8i32_undefs( +; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <8 x i32> +; AVX512-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i32 5 +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <8 x i32> +; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[R71]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -236,33 +236,29 @@ ; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 ; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 ; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 ; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 ; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 -; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] ; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] ; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] ; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] ; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] -; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] -; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] +; SSE-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <2 x i32> ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = 
insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; SSE-NEXT: ret <8 x i32> [[R7]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <8 x i32> +; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP3]], <8 x i32> +; SSE-NEXT: ret <8 x i32> [[R71]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> @@ -388,26 +384,102 @@ } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @sdiv_v8i32_undefs( +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = 
extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; SLM-LABEL: @sdiv_v8i32_undefs( +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SLM-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SLM-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SLM-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SLM-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SLM-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], 
i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SLM-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @sdiv_v8i32_undefs( +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; AVX1-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @sdiv_v8i32_undefs( +; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 
x i32> , i32 [[AB1]], i32 1 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i32 5 +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <8 x i32> +; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[R71]] +; +; AVX512-LABEL: @sdiv_v8i32_undefs( +; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <8 x i32> +; AVX512-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i32 5 +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <8 x i32> +; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; AVX512-NEXT: ret <8 x i32> [[R71]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX ; ; Check that we can commute operands based on the predicate. @@ -235,28 +235,48 @@ } define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { -; CHECK-LABEL: @fcmp_ord_uno_v4i32( -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 -; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 -; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 -; CHECK-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4 -; CHECK-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4 -; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 -; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; CHECK-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]] -; CHECK-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]] -; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] -; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0 -; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1 -; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 
[[C2]], i32 2 -; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3 -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] +; SSE-LABEL: @fcmp_ord_uno_v4i32( +; SSE-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 +; SSE-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4 +; SSE-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4 +; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 +; SSE-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] +; SSE-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]] +; SSE-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]] +; SSE-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] +; SSE-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0 +; SSE-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1 +; SSE-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2 +; SSE-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3 +; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> +; SSE-NEXT: ret <4 x i32> [[R]] +; +; AVX-LABEL: @fcmp_ord_uno_v4i32( +; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 +; AVX-NEXT: [[TMP1:%.*]] = 
bitcast float* [[P1]] to <2 x float>* +; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 +; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] +; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] +; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> undef, <4 x i32> +; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> +; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3 +; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> +; AVX-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX ; ; Check that we can commute operands based on the predicate. 
@@ -235,28 +235,48 @@ } define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { -; CHECK-LABEL: @fcmp_ord_uno_v4i32( -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 -; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 -; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 -; CHECK-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4 -; CHECK-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4 -; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 -; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; CHECK-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]] -; CHECK-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]] -; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] -; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0 -; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1 -; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2 -; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3 -; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] +; SSE-LABEL: @fcmp_ord_uno_v4i32( +; SSE-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 +; SSE-NEXT: 
[[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; SSE-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 +; SSE-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4 +; SSE-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4 +; SSE-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 +; SSE-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] +; SSE-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]] +; SSE-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]] +; SSE-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] +; SSE-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0 +; SSE-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1 +; SSE-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2 +; SSE-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3 +; SSE-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> +; SSE-NEXT: ret <4 x i32> [[R]] +; +; AVX-LABEL: @fcmp_ord_uno_v4i32( +; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; AVX-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; AVX-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 +; AVX-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* +; AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; AVX-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 +; AVX-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] +; AVX-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] +; AVX-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> undef, <4 x i32> +; AVX-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], 
<4 x i1> [[TMP5]], <4 x i32> +; AVX-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3 +; AVX-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> +; AVX-NEXT: ret <4 x i32> [[R]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -55,35 +55,32 @@ ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], 
zeroinitializer -; AVX-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] -; AVX-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], -; AVX-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> -; AVX-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], -; AVX-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] -; AVX-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 -; AVX-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP17]], i32 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1 -; AVX-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], -; AVX-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> -; AVX-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], -; AVX-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> , <2 x float> [[TMP21]] +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[SHUFFLE]], [[TMP3]] +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> +; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] +; AVX-NEXT: 
[[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] +; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP13]], i32 0 +; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 +; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], +; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> +; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], +; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AVX: for.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -144,45 +144,44 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: 
[[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP17:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[SHUFFLE1]], <2 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP4]], 
i32 3 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], -; CHECK-NEXT: [[TMP16]] = extractelement <2 x float> [[SHUFFLE1]], i32 1 -; CHECK-NEXT: [[TMP17]] = extractelement <2 x float> [[SHUFFLE1]], i32 0 -; CHECK-NEXT: [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* +; CHECK-NEXT: [[TMP12]] = load <2 x float>, <2 x float>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = fmul <4 x float> [[TMP17]], +; CHECK-NEXT: [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP19]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] 
= fadd float [[ADD6]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP19]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP19]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll @@ -7,14 +7,12 @@ ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0:%.*]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP6]] to float -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = sitofp i32 [[TMP9]] to float -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP10]], i32 3 -; CHECK-NEXT: ret <4 x float> [[TMP11]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x i32> +; CHECK-NEXT: ret <4 x 
float> [[TMP9]] ; %2 = extractelement <4 x i32> %0, i32 1 %3 = sitofp i32 %2 to float diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -14,55 +14,58 @@ ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 ; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1 -; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2 -; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = lshr <2 x i32> [[TMP2]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3 ; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4 ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 
x i32> [[TMP3]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[CONV31_I]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[CONV31_I]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[CONV31_I]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 -; CHECK-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP7]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 -; CHECK-NEXT: [[TMP7:%.*]] = lshr <4 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> 
[[TMP9]], i32 [[SHR_I_I]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_1_I_I]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[SHR_2_I_I]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i32> [[TMP14]], <16 x i32> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP16]], <16 x i32> [[TMP17]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = trunc <16 x i32> [[TMP18]] to <16 x i8> -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP31:%.*]] = and <16 x i8> [[TMP19]], +; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 
[[SHR_I_I]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i32> [[TMP13]], <16 x i32> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i32> [[TMP15]], <16 x i32> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i32> [[TMP17]], <16 x i32> [[TMP18]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i32> [[TMP19]], <16 x i32> [[TMP20]], <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = trunc <16 x i32> [[TMP21]] to <16 x i8> +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i8> [[TMP22]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP31]], <16 x i8>* [[TMP32]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP36]], <16 x i8>* [[TMP37]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -23,40 +23,41 @@ define float @foo(float* nocapture readonly %A) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[R_030:%.*]] = phi float [ 
[[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 -; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00 -; CHECK-NEXT: [[ADD9]] = fadd float [[G_031]], [[MUL8]] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP9]], +; CHECK-NEXT: [[TMP11]] = fadd <2 x float> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00 ; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 
[[TMP8]], 121 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]] ; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] ; CHECK-NEXT: ret float [[ADD17]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll @@ -413,16 +413,15 @@ ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; SSE2-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> -; SSE2-NEXT: [[V11:%.*]] = shufflevector <4 x i64> poison, <4 x i64> [[TMP4]], <4 x i32> -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V11]], i64 [[X2]], i32 2 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 
[[X3]], i32 3 -; SSE2-NEXT: ret <4 x i64> [[V3]] +; SSE2-NEXT: [[TMP3:%.*]] = bitcast i16* [[P2]] to <2 x i16>* +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 1 +; SSE2-NEXT: [[TMP5:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64> +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> undef, <4 x i32> +; SSE2-NEXT: [[V12:%.*]] = shufflevector <4 x i64> poison, <4 x i64> [[TMP7]], <4 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> undef, <4 x i32> +; SSE2-NEXT: [[V31:%.*]] = shufflevector <4 x i64> [[V12]], <4 x i64> [[TMP8]], <4 x i32> +; SSE2-NEXT: ret <4 x i64> [[V31]] ; ; SLM-LABEL: @loadext_4i16_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll @@ -413,16 +413,15 @@ ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; SSE2-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> -; SSE2-NEXT: [[V11:%.*]] = shufflevector <4 x i64> undef, <4 x i64> [[TMP4]], <4 x i32> -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V11]], i64 [[X2]], i32 2 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE2-NEXT: ret <4 x i64> [[V3]] +; SSE2-NEXT: [[TMP3:%.*]] = bitcast i16* [[P2]] to <2 x i16>* +; 
SSE2-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 1 +; SSE2-NEXT: [[TMP5:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64> +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> undef, <4 x i32> +; SSE2-NEXT: [[V12:%.*]] = shufflevector <4 x i64> undef, <4 x i64> [[TMP7]], <4 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> undef, <4 x i32> +; SSE2-NEXT: [[V31:%.*]] = shufflevector <4 x i64> [[V12]], <4 x i64> [[TMP8]], <4 x i32> +; SSE2-NEXT: ret <4 x i64> [[V31]] ; ; SLM-LABEL: @loadext_4i16_to_4i64( ; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1