diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6629,7 +6629,8 @@
         Op = SV->getOperand(1);
       }
       if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
-          !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) {
+          !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
+          ShuffleVectorInst::isZeroEltSplatMask(Mask)) {
         if (IdentityOp) {
           V = IdentityOp;
           assert(Mask.size() == IdentityMask.size() &&
@@ -6661,6 +6662,8 @@
   static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                               ShuffleBuilderTy &Builder) {
     assert(V1 && "Expected at least one vector value.");
+    if (V2)
+      Builder.resizeToMatch(V1, V2);
     int VF = Mask.size();
     if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
       VF = FTy->getNumElements();
@@ -6748,6 +6751,15 @@
           CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
       }
     }
+    const int Limit = CombinedMask1.size() * 2;
+    if (Op1 == Op2 && Limit == 2 * VF &&
+        all_of(CombinedMask1, [=](int Idx) { return Idx < Limit; }) &&
+        (ShuffleVectorInst::isIdentityMask(CombinedMask1) ||
+         (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1) &&
+          isa<ShuffleVectorInst>(Op1) &&
+          cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
+              ArrayRef(CombinedMask1))))
+      return Op1;
     return Builder.createShuffleVector(
         Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
         CombinedMask1);
@@ -9294,44 +9306,6 @@
       }
       return VecBase;
     };
-    auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
-      unsigned VF1 = cast<FixedVectorType>(V1->getType())->getNumElements();
-      unsigned VF2 = cast<FixedVectorType>(V2->getType())->getNumElements();
-      unsigned VF = std::max(VF1, VF2);
-      if (VF1 != VF2) {
-        SmallVector<int> ExtMask(VF, UndefMaskElem);
-        std::iota(ExtMask.begin(), std::next(ExtMask.begin(), std::min(VF1, VF2)),
-                  0);
-        if (VF1 < VF2) {
-          V1 = Builder.CreateShuffleVector(V1, ExtMask);
-          if (auto *I = dyn_cast<Instruction>(V1)) {
-            GatherShuffleExtractSeq.insert(I);
-            CSEBlocks.insert(I->getParent());
-          }
-        } else {
-          V2 = Builder.CreateShuffleVector(V2, ExtMask);
-          if (auto *I = dyn_cast<Instruction>(V2)) {
-            GatherShuffleExtractSeq.insert(I);
-            CSEBlocks.insert(I->getParent());
-          }
-        }
-      }
-      const int Limit = Mask.size() * 2;
-      if (V1 == V2 && Mask.size() == VF &&
-          all_of(Mask, [=](int Idx) { return Idx < Limit; }) &&
-          (ShuffleVectorInst::isIdentityMask(Mask) ||
-           (ShuffleVectorInst::isZeroEltSplatMask(Mask) &&
-            isa<ShuffleVectorInst>(V1) &&
-            cast<ShuffleVectorInst>(V1)->getShuffleMask() == Mask)))
-        return V1;
-      Value *Vec = V1 == V2 ? Builder.CreateShuffleVector(V1, Mask)
-                            : Builder.CreateShuffleVector(V1, V2, Mask);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
-      }
-      return Vec;
-    };
     auto NeedToDelay = [=](const TreeEntry *E,
                            ArrayRef<const TreeEntry *> Deps) -> Value * {
       // No need to delay emission if all deps are ready.
@@ -9438,28 +9412,17 @@
         }
       }
       if (Vec2)
-        Vec1 = CreateShuffle(Vec1, Vec2, ExtractMask);
+        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
       else if (Vec1)
-        Vec1 = CreateShuffle(Vec1, Vec1, ExtractMask);
+        ShuffleBuilder.add(Vec1, ExtractMask);
       else
-        Vec1 = PoisonValue::get(
-            FixedVectorType::get(ScalarTy, GatheredScalars.size()));
+        ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
+                               ScalarTy, GatheredScalars.size())),
+                           ExtractMask);
     }
     if (GatherShuffle) {
-      Vec = CreateShuffle(Entries.front()->VectorizedValue,
-                          Entries.back()->VectorizedValue, Mask);
-      if (Vec1) {
-        // Build final mask.
-        for (auto [I, Idx] : enumerate(Mask)) {
-          if (ExtractMask[I] != UndefMaskElem)
-            Idx = I;
-          else if (Idx != UndefMaskElem)
-            Idx = I + VF;
-        }
-        Vec = CreateShuffle(Vec1, Vec, Mask);
-      }
-    } else {
-      Vec = Vec1;
+      ShuffleBuilder.add(Entries.front()->VectorizedValue,
+                         Entries.back()->VectorizedValue, Mask);
     }
   } else if (!allConstant(E->Scalars)) {
     // TODO: remove this code once able to combine shuffled vectors and build
@@ -9555,12 +9518,13 @@
     }
     // Gather unique scalars and all constants.
     Vec = gather(GatheredScalars);
+    ShuffleBuilder.add(Vec, ReuseMask);
   } else {
     // Gather all constants.
    Vec = gather(E->Scalars);
+    ShuffleBuilder.add(Vec, ReuseMask);
   }
-  ShuffleBuilder.add(Vec, ReuseMask);
   Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
   if (NeedFreeze)
     Vec = Builder.CreateFreeze(Vec);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
@@ -211,15 +211,14 @@
 ; CHECK-NEXT:    [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
 ; CHECK-NEXT:    [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <9 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32>
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_2]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_3]])
-; CHECK-NEXT:    store <9 x double> [[TMP4]], ptr [[PTR_1]], align 8
+; CHECK-NEXT:    store <9 x double> [[TMP3]], ptr [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -13,7 +13,7 @@
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = and <2 x i32> [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-nodes-dependency.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-nodes-dependency.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-nodes-dependency.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-nodes-dependency.ll
@@ -12,7 +12,7 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP7]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
@@ -10,11 +10,10 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    br i1 false, label [[BB5:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <2 x i32> [[TMP4]],
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP1]],
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[OP_RDX10:%.*]] = add i32 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[OP_RDX11:%.*]] = add i32 [[OP_RDX10]], 0
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb2:
@@ -23,10 +22,10 @@
 ; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OP_RDX11]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       bb4:
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP2]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
-; CHECK-NEXT:    [[OP_RDX8:%.*]] = add i32 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP2]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    [[OP_RDX8:%.*]] = add i32 [[TMP9]], 0
 ; CHECK-NEXT:    [[OP_RDX9:%.*]] = add i32 [[OP_RDX8]], [[TMP3]]
 ; CHECK-NEXT:    ret i32 [[OP_RDX9]]
 ; CHECK:       bb5:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -643,12 +643,11 @@
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32>
 ; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP0]], [[TMP4]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; SSE-NEXT:    [[ADD3:%.*]] = fadd double [[TMP7]], [[TMP8]]
+; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; SSE-NEXT:    [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]]
 ; SSE-NEXT:    ret double [[ADD3]]
 ;
 ; AVX-LABEL: @splat_loads(
@@ -700,14 +699,13 @@
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32>
 ; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP0]], [[TMP4]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32>
-; SSE-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP6]], [[TMP7]]
-; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
-; SSE-NEXT:    [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]]
+; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
+; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; SSE-NEXT:    [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]]
 ; SSE-NEXT:    ret double [[RES]]
 ;
 ; AVX-LABEL: @splat_loads_with_internal_uses(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -333,16 +333,15 @@
 ; CHECK-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
 ; CHECK-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
 ; CHECK-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[Y1]], i32 5
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[Y2]], i32 6
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y3]], i32 7
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i32> [[TMP2]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = freeze <8 x i1> [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP8]])
-; CHECK-NEXT:    ret i1 [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[Y1]], i32 5
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[Y2]], i32 6
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[Y3]], i32 7
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i32> [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]])
+; CHECK-NEXT:    ret i1 [[TMP8]]
 ;
   %x0 = extractelement <8 x i32> %x, i32 0
   %x1 = extractelement <8 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
@@ -18,11 +18,10 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[TMP9]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <4 x float> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    store <4 x float> [[TMP13]], ptr [[RESULT]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    store <4 x float> [[TMP12]], ptr [[RESULT]], align 4
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ;
 entry:
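
For illustration only (not part of the patch): a minimal hand-written sketch of the redundancy the new early return in createShuffle avoids. When the combined mask is an identity mask, or a zero-element splat whose mask matches the shufflevector that already defines the operand, the operand is now reused directly instead of emitting a second shuffle. The function and value names below are hypothetical.

; Shape before the change: the gather path re-emitted a splat shuffle even
; though %splat already materializes the same zero-element splat mask.
define <2 x double> @redundant_splat(<2 x double> %v, <2 x double> %w) {
  %splat = shufflevector <2 x double> %v, <2 x double> poison, <2 x i32> zeroinitializer
  %resplat = shufflevector <2 x double> %splat, <2 x double> poison, <2 x i32> zeroinitializer
  %r = fmul <2 x double> %w, %resplat
  ret <2 x double> %r
}

; Shape after the change: the existing splat is reused directly, which is the
; kind of cleanup the updated CHECK lines above exercise.
define <2 x double> @reused_splat(<2 x double> %v, <2 x double> %w) {
  %splat = shufflevector <2 x double> %v, <2 x double> poison, <2 x i32> zeroinitializer
  %r = fmul <2 x double> %w, %splat
  ret <2 x double> %r
}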