diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3508,6 +3508,9 @@
     case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
+      // Cost of replacing ExtractElement/ExtractValue scalar instructions
+      // with the vectorized instructions.
+      InstructionCost DeadCost = 0;
       if (NeedToShuffleReuses) {
         unsigned Idx = 0;
         for (unsigned I : E->ReuseShuffleIndices) {
@@ -3535,12 +3538,10 @@
           ReuseShuffleCost +=
               TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
         }
-      }
-      InstructionCost DeadCost = ReuseShuffleCost;
-      if (!E->ReorderIndices.empty()) {
-        // TODO: Merge this shuffle with the ReuseShuffleCost.
-        DeadCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+        DeadCost = ReuseShuffleCost;
+      } else if (!E->ReorderIndices.empty()) {
+        DeadCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                       VecTy);
       }
       for (unsigned I = 0, E = VL.size(); I < E; ++I) {
         Instruction *EI = cast<Instruction>(VL[I]);
@@ -3764,11 +3765,9 @@
             Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
             /*VariableMask=*/false, alignment, CostKind, VL0);
       }
-      if (!E->ReorderIndices.empty()) {
-        // TODO: Merge this shuffle with the ReuseShuffleCost.
+      if (!NeedToShuffleReuses && !E->ReorderIndices.empty())
         VecLdCost += TTI->getShuffleCost(
             TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
-      }
       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
       return ReuseShuffleCost + VecLdCost - ScalarLdCost;
     }
@@ -3780,18 +3779,14 @@
       Align Alignment = SI->getAlign();
       InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
          Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
-      if (NeedToShuffleReuses)
-        ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
       InstructionCost VecStCost = TTI->getMemoryOpCost(
          Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
-      if (IsReorder) {
-        // TODO: Merge this shuffle with the ReuseShuffleCost.
+      if (IsReorder)
         VecStCost += TTI->getShuffleCost(
             TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
-      }
       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
-      return ReuseShuffleCost + VecStCost - ScalarStCost;
+      return VecStCost - ScalarStCost;
     }
     case Instruction::Call: {
       CallInst *CI = cast<CallInst>(VL0);
@@ -4289,18 +4284,31 @@
   if (E->isSame(VL)) {
     Value *V = vectorizeTree(E);
     if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
-      // We need to get the vectorized value but without shuffle.
-      if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
-        V = SV->getOperand(0);
-      } else {
-        // Reshuffle to get only unique values.
-        SmallVector<int, 4> UniqueIdxs;
-        SmallSet<int, 4> UsedIdxs;
-        for (int Idx : E->ReuseShuffleIndices)
-          if (UsedIdxs.insert(Idx).second)
-            UniqueIdxs.emplace_back(Idx);
-        V = Builder.CreateShuffleVector(V, UniqueIdxs);
+      // Reshuffle to get only unique values.
+      // If some of the scalars are duplicated in the vectorization tree
+      // entry, we do not vectorize them but instead generate a mask for the
+      // reuses. But if there are several users of the same entry, they may
+      // have different vectorization factors. This is especially important
+      // for PHI nodes. In this case, we need to adapt the resulting
+      // instruction for the user vectorization factor and have to reshuffle
+      // it again to take only unique elements of the vector.
+      // Before this patch, the compiler incorrectly returned a reduced
+      // vector instruction with the same (duplicated) elements rather than
+      // the unique ones.
+      // block:
+      // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
+      // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
+      // ... (use %2)
+      // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
+      // br %block
+      SmallVector<int, 4> UniqueIdxs;
+      SmallSet<int, 4> UsedIdxs;
+      int Pos = 0;
+      for (int Idx : E->ReuseShuffleIndices) {
+        if (UsedIdxs.insert(Idx).second)
+          UniqueIdxs.emplace_back(Pos);
+        ++Pos;
+      }
+      V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
     }
     return V;
   }
@@ -4338,6 +4346,64 @@
     return Vec;
   }
 
+namespace {
+/// Merges shuffle masks and emits the final shuffle instruction, if required.
+class ShuffleInstructionBuilder {
+  IRBuilderBase &Builder;
+  bool IsFinalized = false;
+  SmallVector<int, 4> Mask;
+
+public:
+  ShuffleInstructionBuilder(IRBuilderBase &Builder) : Builder(Builder) {}
+
+  /// Adds a mask, inverting it before applying.
+  void addInversedMask(ArrayRef<unsigned> SubMask) {
+    if (SubMask.empty())
+      return;
+    SmallVector<int, 4> NewMask;
+    inversePermutation(SubMask, NewMask);
+    addMask(NewMask);
+  }
+
+  /// Adds a mask, merging it into the single accumulated mask.
+  void addMask(ArrayRef<unsigned> SubMask) {
+    SmallVector<int, 4> NewMask(SubMask.begin(), SubMask.end());
+    addMask(NewMask);
+  }
+
+  void addMask(ArrayRef<int> SubMask) {
+    if (SubMask.empty())
+      return;
+    if (Mask.empty()) {
+      Mask.append(SubMask.begin(), SubMask.end());
+      return;
+    }
+    SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
+    int TermValue = std::min(Mask.size(), SubMask.size());
+    for (int I = 0, E = SubMask.size(); I < E; ++I) {
+      if (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue) {
+        NewMask[I] = E;
+        continue;
+      }
+      NewMask[I] = Mask[SubMask[I]];
+    }
+    Mask.swap(NewMask);
+  }
+
+  Value *finalize(Value *V) {
+    IsFinalized = true;
+    if (Mask.empty())
+      return V;
+    return Builder.CreateShuffleVector(V, Mask, "shuffle");
+  }
+
+  ~ShuffleInstructionBuilder() {
+    assert((IsFinalized || Mask.empty()) &&
+           "Shuffle construction must be finalized.");
+  }
+};
+} // namespace
+
 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
@@ -4346,12 +4412,14 @@
     return E->VectorizedValue;
   }
 
+  ShuffleInstructionBuilder ShuffleBuilder(Builder);
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   if (E->State == TreeEntry::NeedToGather) {
     setInsertPointAfterBundle(E);
     Value *Vec = gather(E->Scalars);
     if (NeedToShuffleReuses) {
-      Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle");
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      Vec = ShuffleBuilder.finalize(Vec);
       if (auto *I = dyn_cast<Instruction>(Vec)) {
         GatherSeq.insert(I);
         CSEBlocks.insert(I->getParent());
@@ -4409,18 +4477,10 @@
     case Instruction::ExtractElement: {
       Value *V = E->getSingleOperand(0);
-      if (!E->ReorderIndices.empty()) {
-        SmallVector<int, 4> Mask;
-        inversePermutation(E->ReorderIndices, Mask);
-        Builder.SetInsertPoint(VL0);
-        V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
-      }
-      if (NeedToShuffleReuses) {
-        // TODO: Merge this shuffle with the ReorderShuffleMask.
-        if (E->ReorderIndices.empty())
-          Builder.SetInsertPoint(VL0);
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
-      }
+      Builder.SetInsertPoint(VL0);
+      ShuffleBuilder.addInversedMask(E->ReorderIndices);
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
       E->VectorizedValue = V;
       return V;
     }
@@ -4431,16 +4491,9 @@
       Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
       LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
       Value *NewV = propagateMetadata(V, E->Scalars);
-      if (!E->ReorderIndices.empty()) {
-        SmallVector<int, 4> Mask;
-        inversePermutation(E->ReorderIndices, Mask);
-        NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle");
-      }
-      if (NeedToShuffleReuses) {
-        // TODO: Merge this shuffle with the ReorderShuffleMask.
-        NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices,
-                                           "shuffle");
-      }
+      ShuffleBuilder.addInversedMask(E->ReorderIndices);
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      NewV = ShuffleBuilder.finalize(NewV);
       E->VectorizedValue = NewV;
       return NewV;
     }
@@ -4467,8 +4520,8 @@
       auto *CI = cast<CastInst>(VL0);
       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -4489,8 +4542,8 @@
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
       Value *V = Builder.CreateCmp(P0, L, R);
       propagateIRFlags(V, E->Scalars, VL0);
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -4509,8 +4562,8 @@
       }
 
       Value *V = Builder.CreateSelect(Cond, True, False);
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -4532,8 +4585,8 @@
       if (auto *I = dyn_cast<Instruction>(V))
         V = propagateMetadata(I, E->Scalars);
 
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -4575,8 +4628,8 @@
       if (auto *I = dyn_cast<Instruction>(V))
         V = propagateMetadata(I, E->Scalars);
 
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -4618,15 +4671,9 @@
       }
       Value *V = propagateMetadata(NewLI, E->Scalars);
-      if (IsReorder) {
-        SmallVector<int, 4> Mask;
-        inversePermutation(E->ReorderIndices, Mask);
-        V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
-      }
-      if (NeedToShuffleReuses) {
-        // TODO: Merge this shuffle with the ReorderShuffleMask.
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
-      }
+      ShuffleBuilder.addInversedMask(E->ReorderIndices);
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
       E->VectorizedValue = V;
       ++NumVectorInstructions;
       return V;
@@ -4640,11 +4687,9 @@
       setInsertPointAfterBundle(E);
 
       Value *VecValue = vectorizeTree(E->getOperand(0));
-      if (IsReorder) {
-        SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
-                                 E->ReorderIndices.end());
-        VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf");
-      }
+      ShuffleBuilder.addMask(E->ReorderIndices);
+      VecValue = ShuffleBuilder.finalize(VecValue);
+
       Value *ScalarPtr = SI->getPointerOperand();
       Value *VecPtr = Builder.CreateBitCast(
           ScalarPtr, VecValue->getType()->getPointerTo(AS));
@@ -4658,8 +4703,6 @@
         ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
 
       Value *V = propagateMetadata(ST, E->Scalars);
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -4697,8 +4740,8 @@
       if (Instruction *I = dyn_cast<Instruction>(V))
         V = propagateMetadata(I, E->Scalars);
 
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -4760,8 +4803,8 @@
         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
 
       propagateIRFlags(V, E->Scalars, VL0);
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -4827,8 +4870,8 @@
       Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
       if (Instruction *I = dyn_cast<Instruction>(V))
         V = propagateMetadata(I, E->Scalars);
-      if (NeedToShuffleReuses)
-        V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+      ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+      V = ShuffleBuilder.finalize(V);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
@@ -82,8 +82,7 @@
 ; CHECK: cont:
 ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
 ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
-; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <2 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> poison, <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32>
 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
@@ -35,8 +35,7 @@
 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <2 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[REORDER_SHUFFLE]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32>
 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
@@ -8,10 +8,10 @@
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 0
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8
-; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[REORDER_SHUFFLE]],
+; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[SHRINK_SHUFFLE]],
 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4
 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5
@@ -66,7 +66,7 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF_END:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[SHRINK_SHUFFLE:%.*]], [[IF_END:%.*]] ]
 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32>
 ; CHECK-NEXT: br label [[IF_END]]
 ; CHECK: if.end:
@@ -78,6 +78,7 @@
 ; CHECK-NEXT: [[ARRAYIDX11_6:%.*]] = getelementptr inbounds i16, i16* undef, i32 6
 ; CHECK-NEXT: [[ARRAYIDX11_7:%.*]] = getelementptr inbounds i16, i16* undef, i32 7
 ; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* undef, align 2
+; CHECK-NEXT: [[SHRINK_SHUFFLE]] = shufflevector <8 x i16> [[SHUFFLE]], <8 x i16> poison, <2 x i32>
 ; CHECK-NEXT: br label [[FOR_BODY]]
 ;
 entry:
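
Note (illustration, not part of the patch): the heart of the change is that ShuffleInstructionBuilder composes the inverted reorder mask and the reuse mask, so a single shufflevector replaces the old "reorder_shuffle" + "shuffle" pair. Below is a minimal standalone C++ sketch of that composition with no LLVM dependencies; invertPermutation and composeMasks are hypothetical stand-ins for inversePermutation and addMask, and the sample masks are generic, not taken from the tests.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

// Modeled after llvm::inversePermutation: Mask[Indices[I]] = I.
static std::vector<int> invertPermutation(const std::vector<int> &Indices) {
  std::vector<int> Mask(Indices.size());
  for (int I = 0, E = (int)Indices.size(); I < E; ++I)
    Mask[Indices[I]] = I;
  return Mask;
}

// Modeled after ShuffleInstructionBuilder::addMask: applying SubMask to a
// vector already permuted by Mask equals one shuffle with Mask[SubMask[I]].
// Out-of-range entries are left "undefined" (encoded as SubMask.size()),
// mirroring the TermValue handling in the patch.
static std::vector<int> composeMasks(const std::vector<int> &Mask,
                                     const std::vector<int> &SubMask) {
  if (Mask.empty())
    return SubMask;
  std::vector<int> NewMask(SubMask.size(), (int)SubMask.size());
  int TermValue = (int)std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = (int)SubMask.size(); I < E; ++I) {
    if (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)
      continue; // leave as "undefined"
    NewMask[I] = Mask[SubMask[I]];
  }
  return NewMask;
}

int main() {
  // ReorderIndices {1, 0} invert to the reorder mask {1, 0}; composing it
  // with the reuse mask {0, 0, 1, 1} yields the single mask {1, 1, 0, 0}.
  std::vector<int> ReorderMask = invertPermutation({1, 0});
  std::vector<int> Combined = composeMasks(ReorderMask, {0, 0, 1, 1});
  assert((Combined == std::vector<int>{1, 1, 0, 0}));
  for (int Idx : Combined)
    std::printf("%d ", Idx);
  std::printf("\n");
  return 0;
}

Applying the composed mask to the original vector selects the same lanes as applying the two masks in sequence, which is why the updated FileCheck lines expect one shuffle where they previously expected "reorder_shuffle" followed by "shuffle".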
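
Note (illustration, not part of the patch): the other behavioral fix is the new "shrink.shuffle" emitted when a reused tree entry has to be narrowed for a user with a smaller vectorization factor. The mask must select the position of the first occurrence of each reuse index (Pos), not the index value itself (Idx), which is what the removed code did. A self-contained sketch of that mask computation, with a hypothetical helper name:

#include <cassert>
#include <cstdio>
#include <set>
#include <vector>

// Mirrors the UniqueIdxs/UsedIdxs loop in the patch: for each value in the
// reuse mask, record the position of its first occurrence. Shuffling the
// widened vector with this mask shrinks it back to one lane per unique scalar.
static std::vector<int> shrinkMask(const std::vector<int> &ReuseShuffleIndices) {
  std::vector<int> UniqueIdxs;
  std::set<int> UsedIdxs;
  int Pos = 0;
  for (int Idx : ReuseShuffleIndices) {
    if (UsedIdxs.insert(Idx).second)
      UniqueIdxs.push_back(Pos); // position of first occurrence, not Idx
    ++Pos;
  }
  return UniqueIdxs;
}

int main() {
  // Matches the example in the patch comment: a <4 x > vector built with
  // reuse mask <0, 0, 1, 1> is shrunk back to <2 x > with mask <0, 2>.
  std::vector<int> Shrink = shrinkMask({0, 0, 1, 1});
  assert((Shrink == std::vector<int>{0, 2}));
  for (int Idx : Shrink)
    std::printf("%d ", Idx);
  std::printf("\n");
  return 0;
}

This is the shuffle the shrink_after_reorder.ll checks now expect, where the widened <8 x i16> SHUFFLE value is narrowed back to <2 x i16> before feeding the phi in the loop header.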