diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -544,15 +544,19 @@
 /// \returns inserting index of InsertElement or InsertValue instruction,
 /// using Offset as base offset for index.
-static Optional<unsigned> getInsertIndex(Value *InsertInst, unsigned Offset) {
-  unsigned Index = Offset;
+static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) {
+  int Index = Offset;
   if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
     if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
       auto *VT = cast<FixedVectorType>(IE->getType());
+      if (CI->getValue().uge(VT->getNumElements()))
+        return UndefMaskElem;
       Index *= VT->getNumElements();
       Index += CI->getZExtValue();
       return Index;
     }
+    if (isa<UndefValue>(IE->getOperand(2)))
+      return UndefMaskElem;
     return None;
   }
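Note: the revised getInsertIndex now distinguishes three outcomes instead of two. A minimal standalone model of that contract, in plain C++ rather than the LLVM API (IndexOperand and getInsertIndexModel are invented names for illustration):

    #include <cassert>
    #include <optional>

    constexpr int UndefMaskElem = -1;

    // Invented stand-in for an insertelement index operand: Known means the
    // index is a constant; IsUndef means the index operand is undef.
    struct IndexOperand {
      bool Known = false;
      bool IsUndef = false;
      unsigned Value = 0;
    };

    std::optional<int> getInsertIndexModel(IndexOperand Idx,
                                           unsigned NumElements,
                                           unsigned Offset) {
      if (Idx.Known) {
        if (Idx.Value >= NumElements)
          return UndefMaskElem; // out-of-range constant -> undefined lane
        return int(Offset * NumElements + Idx.Value);
      }
      if (Idx.IsUndef)
        return UndefMaskElem;   // undef index -> undefined lane
      return std::nullopt;      // non-constant index -> not analyzable
    }

    int main() {
      assert(*getInsertIndexModel({true, false, 3}, 8, 0) == 3);
      assert(*getInsertIndexModel({true, false, 9}, 8, 0) == UndefMaskElem);
      assert(*getInsertIndexModel({false, true, 0}, 8, 0) == UndefMaskElem);
      assert(!getInsertIndexModel({false, false, 0}, 8, 0).has_value());
    }

Callers below rely on exactly this split: UndefMaskElem lanes are skipped, while a missing value still aborts the analysis.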
@@ -2884,36 +2888,36 @@
     case Instruction::InsertElement: {
       assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
 
-      int Offset = *getInsertIndex(VL[0], 0);
-      ValueList Operands(VL.size());
-      Operands[0] = cast<Instruction>(VL[0])->getOperand(1);
-      bool IsConsecutive = true;
-      for (unsigned I = 1, E = VL.size(); I < E; ++I) {
-        IsConsecutive &= (I == *getInsertIndex(VL[I], 0) - Offset);
-        Operands[I] = cast<Instruction>(VL[I])->getOperand(1);
+      // Check that we have a buildvector and not a shuffle of 2 or more
+      // different vectors.
+      ValueSet SourceVectors;
+      for (Value *V : VL)
+        SourceVectors.insert(cast<InsertElementInst>(V)->getOperand(0));
+
+      if (count_if(VL, [&SourceVectors](Value *V) {
+            return !SourceVectors.contains(V);
+          }) >= 2) {
+        // Found 2nd source vector - cancel.
+        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
+                             "different source vectors.\n");
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndicies);
+        BS.cancelScheduling(VL, VL0);
+        return;
       }
-      // FIXME: support vectorization of non-consecutive inserts
-      // using shuffle with its proper cost estimation
-      if (IsConsecutive) {
-        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
-        LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
+      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
+      LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
 
-        TE->setOperand(0, Operands);
-
-        ValueList VectorOperands;
+      constexpr int NumOps = 2;
+      ValueList VectorOperands[NumOps];
+      for (int I = 0; I < NumOps; ++I) {
         for (Value *V : VL)
-          VectorOperands.push_back(cast<Instruction>(V)->getOperand(0));
-
-        TE->setOperand(1, VectorOperands);
+          VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
 
-        buildTree_rec(Operands, Depth + 1, {TE, 0});
-        return;
+        TE->setOperand(I, VectorOperands[I]);
       }
-
-      LLVM_DEBUG(dbgs() << "SLP: skipping non-consecutive inserts.\n");
-      BS.cancelScheduling(VL, VL0);
-      buildTree_rec(Operands, Depth, UserTreeIdx);
+      buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0});
       return;
     }
     case Instruction::Load: {
@@ -3806,21 +3810,41 @@
       auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
       unsigned const NumElts = SrcVecTy->getNumElements();
+      unsigned const NumScalars = VL.size();
       APInt DemandedElts = APInt::getNullValue(NumElts);
-      for (auto *V : VL)
-        DemandedElts.setBit(*getInsertIndex(V, 0));
+      // TODO: Add support for Instruction::InsertValue.
+      unsigned Offset = UINT_MAX;
+      bool IsIdentity = true;
+      SmallVector<int> ShuffleMask(NumElts, UndefMaskElem);
+      for (unsigned I = 0; I < NumScalars; ++I) {
+        Optional<int> InsertIdx = getInsertIndex(VL[I], 0);
+        if (!InsertIdx || *InsertIdx == UndefMaskElem)
+          continue;
+        unsigned Idx = *InsertIdx;
+        DemandedElts.setBit(Idx);
+        if (Idx < Offset) {
+          Offset = Idx;
+          IsIdentity &= I == 0;
+        } else {
+          assert(Idx >= Offset && "Failed to find vector index offset");
+          IsIdentity &= Idx - Offset == I;
+        }
+        ShuffleMask[Idx] = I;
+      }
+      assert(Offset < NumElts && "Failed to find vector index offset");
 
       InstructionCost Cost = 0;
       Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                             /*Insert*/ true, /*Extract*/ false);
 
-      unsigned const NumScalars = VL.size();
-      unsigned const Offset = *getInsertIndex(VL[0], 0);
-      if (NumElts != NumScalars && Offset % NumScalars != 0)
+      if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0)
         Cost += TTI->getShuffleCost(
             TargetTransformInfo::SK_InsertSubvector, SrcVecTy, /*Mask*/ None,
             Offset,
             FixedVectorType::get(SrcVecTy->getElementType(), NumScalars));
+      else if (!IsIdentity)
+        Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy,
+                                    ShuffleMask);
       return Cost;
     }
@@ -4352,6 +4376,11 @@
   SmallPtrSet<Value *, 16> ExtractCostCalculated;
   InstructionCost ExtractCost = 0;
+  SmallBitVector IsIdentity;
+  SmallVector<unsigned> VF;
+  SmallVector<SmallVector<int>> ShuffleMask;
+  SmallVector<Value *> FirstUsers;
+  SmallVector<APInt> DemandedElts;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!ExtractCostCalculated.insert(EU.Scalar).second)
       continue;
@@ -4367,6 +4396,49 @@
     if (isa<FixedVectorType>(EU.Scalar->getType()))
       continue;
 
+    // If found user is an insertelement, do not calculate extract cost but try
+    // to detect it as a final shuffled/identity match.
+    if (EU.User && isa<InsertElementInst>(EU.User)) {
+      if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
+        Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
+        if (!InsertIdx || *InsertIdx == UndefMaskElem)
+          continue;
+        Value *VU = EU.User;
+        auto *It = find_if(FirstUsers, [VU](Value *V) {
+          // Checks if 2 insertelements are from the same buildvector.
+          if (VU->getType() != V->getType())
+            return false;
+          auto *IE1 = cast<InsertElementInst>(VU);
+          auto *IE2 = cast<InsertElementInst>(V);
+          do {
+            if (IE2 == VU || IE1 == V)
+              return true;
+            if (IE1)
+              IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
+            if (IE2)
+              IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
+          } while (IE1 || IE2);
+          return false;
+        });
+        int VecId = -1;
+        if (It == FirstUsers.end()) {
+          VF.push_back(FTy->getNumElements());
+          ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
+          FirstUsers.push_back(EU.User);
+          DemandedElts.push_back(APInt::getNullValue(VF.back()));
+          IsIdentity.push_back(true);
+          VecId = FirstUsers.size() - 1;
+        } else {
+          VecId = std::distance(FirstUsers.begin(), It);
+        }
+        int Idx = *InsertIdx;
+        ShuffleMask[VecId][Idx] = EU.Lane;
+        IsIdentity.set(IsIdentity.test(VecId) &
+                       (EU.Lane == Idx || EU.Lane == UndefMaskElem));
+        DemandedElts[VecId].setBit(Idx);
+      }
+    }
+
     // If we plan to rewrite the tree in a smaller type, we will need to sign
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
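The Offset/IsIdentity/ShuffleMask computation used by the cost code above is self-contained enough to model outside LLVM. A sketch under the assumption that only constant insert positions matter (InsertShape and analyzeInserts are hypothetical names, plain STL in place of SmallVector):

    #include <cassert>
    #include <climits>
    #include <vector>

    constexpr int UndefMaskElem = -1;

    // Given the insert positions of the scalars (in bundle order), compute
    // the minimal offset, whether the inserts form an identity run starting
    // there, and the single-source mask used for SK_PermuteSingleSrc costing.
    struct InsertShape {
      unsigned Offset = UINT_MAX;
      bool IsIdentity = true;
      std::vector<int> ShuffleMask;
    };

    InsertShape analyzeInserts(const std::vector<unsigned> &InsertIdx,
                               unsigned NumElts) {
      InsertShape S;
      S.ShuffleMask.assign(NumElts, UndefMaskElem);
      for (unsigned I = 0; I < InsertIdx.size(); ++I) {
        unsigned Idx = InsertIdx[I];
        if (Idx < S.Offset) {
          S.Offset = Idx;
          S.IsIdentity &= I == 0;
        } else {
          S.IsIdentity &= (Idx - S.Offset) == I;
        }
        S.ShuffleMask[Idx] = int(I);
      }
      return S;
    }

    int main() {
      // Consecutive inserts at lanes 4..5 of a <8 x i32>: identity, offset 4,
      // eligible for the SK_InsertSubvector cost path.
      InsertShape A = analyzeInserts({4, 5}, 8);
      assert(A.IsIdentity && A.Offset == 4);

      // Scattered inserts at lanes 0, 2, 4, 7: not an identity, so the cost
      // model now prices a SK_PermuteSingleSrc shuffle with this mask.
      InsertShape B = analyzeInserts({0, 2, 4, 7}, 8);
      assert(!B.IsIdentity && B.Offset == 0);
      assert(B.ShuffleMask[2] == 1 && B.ShuffleMask[7] == 3);
    }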
@@ -4387,6 +4459,39 @@
   InstructionCost SpillCost = getSpillCost();
   Cost += SpillCost + ExtractCost;
+  for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+    if (!IsIdentity.test(I)) {
+      InstructionCost C = TTI->getShuffleCost(
+          TTI::SK_PermuteSingleSrc,
+          cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
+      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+                        << " for final shuffle of insertelement external users "
+                        << *VectorizableTree.front()->Scalars.front() << ".\n"
+                        << "SLP: Current total cost = " << Cost << "\n");
+      Cost += C;
+    }
+    unsigned VF = ShuffleMask[I].size();
+    for (int &Mask : ShuffleMask[I])
+      Mask = (Mask == UndefMaskElem ? 0 : VF) + Mask;
+    InstructionCost C = TTI->getShuffleCost(
+        TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
+        ShuffleMask[I]);
+    LLVM_DEBUG(
+        dbgs()
+        << "SLP: Adding cost " << C
+        << " for final shuffle of vector node and external insertelement users "
+        << *VectorizableTree.front()->Scalars.front() << ".\n"
+        << "SLP: Current total cost = " << Cost << "\n");
+    Cost += C;
+    InstructionCost InsertCost = TTI->getScalarizationOverhead(
+        cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
+        /*Insert*/ true,
+        /*Extract*/ false);
+    Cost -= InsertCost;
+    LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
+                      << " for insertelements gather.\n"
+                      << "SLP: Current total cost = " << Cost << "\n");
+  }
 
 #ifndef NDEBUG
   SmallString<256> Str;
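The in-place mask rebias in this hunk is subtle: defined lanes are shifted by VF so they select from the second source of the SK_PermuteTwoSrc query, while UndefMaskElem entries survive unchanged because -1 + 0 stays -1. A standalone sketch of just that fixup (rebiasForTwoSrc is an invented name):

    #include <cassert>
    #include <vector>

    constexpr int UndefMaskElem = -1;

    // Rebias a single-source mask so every defined lane selects from the
    // second operand of a two-source shuffle (lanes VF..2*VF-1), leaving
    // undef lanes untouched, mirroring the fixup before the
    // SK_PermuteTwoSrc cost query above.
    void rebiasForTwoSrc(std::vector<int> &Mask) {
      unsigned VF = Mask.size();
      for (int &M : Mask)
        M = (M == UndefMaskElem ? 0 : int(VF)) + M;
    }

    int main() {
      // Lanes 4..5 were produced by the vectorized tree (mask entries 0 and
      // 1); the remaining lanes keep whatever the first source holds.
      std::vector<int> Mask = {UndefMaskElem, UndefMaskElem, UndefMaskElem,
                               UndefMaskElem, 0, 1, UndefMaskElem,
                               UndefMaskElem};
      rebiasForTwoSrc(Mask);
      assert(Mask[4] == 8 && Mask[5] == 9 && Mask[0] == UndefMaskElem);
    }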
@@ -4849,35 +4954,62 @@
     }
     case Instruction::InsertElement: {
       Builder.SetInsertPoint(VL0);
-      Value *V = vectorizeTree(E->getOperand(0));
+      Value *V = vectorizeTree(E->getOperand(1));
 
       const unsigned NumElts =
           cast<FixedVectorType>(VL0->getType())->getNumElements();
       const unsigned NumScalars = E->Scalars.size();
 
       // Create InsertVector shuffle if necessary
-      if (NumElts != NumScalars) {
-        unsigned MinIndex = *getInsertIndex(E->Scalars[0], 0);
-        Instruction *FirstInsert = nullptr;
-        for (auto *Scalar : E->Scalars)
-          if (!FirstInsert &&
-              !is_contained(E->Scalars,
-                            cast<Instruction>(Scalar)->getOperand(0)))
-            FirstInsert = cast<Instruction>(Scalar);
-
-        // Create shuffle to resize vector
-        SmallVector<int> Mask(NumElts, UndefMaskElem);
+      Instruction *FirstInsert = nullptr;
+      bool IsIdentity = true;
+      unsigned Offset = UINT_MAX;
+      for (unsigned I = 0; I < NumScalars; ++I) {
+        Value *Scalar = E->Scalars[I];
+        if (!FirstInsert &&
+            !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0)))
+          FirstInsert = cast<Instruction>(Scalar);
+        Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
+        if (!InsertIdx || *InsertIdx == UndefMaskElem)
+          continue;
+        unsigned Idx = *InsertIdx;
+        if (Idx < Offset) {
+          Offset = Idx;
+          IsIdentity &= I == 0;
+        } else {
+          assert(Idx >= Offset && "Failed to find vector index offset");
+          IsIdentity &= Idx - Offset == I;
+        }
+      }
+      assert(Offset < NumElts && "Failed to find vector index offset");
+
+      // Create shuffle to resize vector
+      SmallVector<int> Mask(NumElts, UndefMaskElem);
+      if (!IsIdentity) {
+        for (unsigned I = 0; I < NumScalars; ++I) {
+          Value *Scalar = E->Scalars[I];
+          Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
+          if (!InsertIdx || *InsertIdx == UndefMaskElem)
+            continue;
+          Mask[*InsertIdx - Offset] = I;
+        }
+      } else {
         std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+      }
+      if (!IsIdentity || NumElts != NumScalars)
         V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);
 
-        const unsigned MaxIndex = MinIndex + NumScalars;
-        for (unsigned I = 0; I < NumElts; I++)
-          Mask[I] =
-              (I < MinIndex || I >= MaxIndex) ? I : NumElts - MinIndex + I;
+      if (NumElts != NumScalars) {
+        SmallVector<int> InsertMask(NumElts);
+        std::iota(InsertMask.begin(), InsertMask.end(), 0);
+        for (unsigned I = 0; I < NumElts; I++) {
+          if (Mask[I] != UndefMaskElem)
+            InsertMask[Offset + I] = NumElts + I;
+        }
 
         V = Builder.CreateShuffleVector(
-            FirstInsert->getOperand(0), V, Mask,
-            cast<Instruction>(E->Scalars[NumScalars - 1])->getName());
+            FirstInsert->getOperand(0), V, InsertMask,
+            cast<Instruction>(E->Scalars.back())->getName());
       }
 
       ++NumVectorInstructions;
@@ -7565,8 +7697,7 @@
                                    unsigned OperandOffset) {
   do {
     Value *InsertedOperand = LastInsertInst->getOperand(1);
-    Optional<unsigned> OperandIndex =
-        getInsertIndex(LastInsertInst, OperandOffset);
+    Optional<int> OperandIndex = getInsertIndex(LastInsertInst, OperandOffset);
     if (!OperandIndex)
       return false;
     if (isa<InsertElementInst>(InsertedOperand) ||
@@ -7578,14 +7709,11 @@
       BuildVectorOpds[*OperandIndex] = InsertedOperand;
       InsertElts[*OperandIndex] = LastInsertInst;
     }
-    if (isa<UndefValue>(LastInsertInst->getOperand(0)))
-      return true;
     LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
   } while (LastInsertInst != nullptr &&
            (isa<InsertValueInst>(LastInsertInst) ||
-            isa<InsertElementInst>(LastInsertInst)) &&
-           LastInsertInst->hasOneUse());
-  return false;
+            isa<InsertElementInst>(LastInsertInst)));
+  return true;
 }
 
 /// Recognize construction of vectors like
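Taken together, codegen now emits at most two shuffles per insert bundle: a resize/permute of the narrow vector, then a blend into the original source vector. The sketch below (EmittedMasks and buildMasks are invented names; plain STL stands in for SmallVector) recomputes both masks for the scattered-lane case exercised by simple_select2 in the tests further down; the asserted values match that test's expected shuffles.

    #include <cassert>
    #include <climits>
    #include <numeric>
    #include <vector>

    constexpr int UndefMaskElem = -1;

    // Given insert positions (bundle order) and the destination width,
    // compute the two masks: a resize/permute of the narrow vector to the
    // destination width, then a blend of source lanes (0..N-1) with the new
    // lanes (N..2N-1), mirroring the loops in the hunk above.
    struct EmittedMasks {
      std::vector<int> Resize;
      std::vector<int> Blend;
    };

    EmittedMasks buildMasks(const std::vector<unsigned> &InsertIdx,
                            unsigned NumElts) {
      unsigned NumScalars = InsertIdx.size();
      unsigned Offset = UINT_MAX;
      bool IsIdentity = true;
      for (unsigned I = 0; I < NumScalars; ++I) {
        unsigned Idx = InsertIdx[I];
        if (Idx < Offset) {
          Offset = Idx;
          IsIdentity &= I == 0;
        } else {
          IsIdentity &= Idx - Offset == I;
        }
      }
      EmittedMasks M;
      M.Resize.assign(NumElts, UndefMaskElem);
      if (!IsIdentity)
        for (unsigned I = 0; I < NumScalars; ++I)
          M.Resize[InsertIdx[I] - Offset] = int(I);
      else
        std::iota(M.Resize.begin(), M.Resize.begin() + NumScalars, 0);
      M.Blend.resize(NumElts);
      std::iota(M.Blend.begin(), M.Blend.end(), 0);
      for (unsigned I = 0; I < NumElts; ++I)
        if (M.Resize[I] != UndefMaskElem)
          M.Blend[Offset + I] = int(NumElts + I);
      return M;
    }

    int main() {
      // simple_select2-style inserts at lanes 0, 2, 4, 7 of an 8-wide vector.
      EmittedMasks M = buildMasks({0, 2, 4, 7}, 8);
      assert((M.Resize == std::vector<int>{0, -1, 1, -1, 2, -1, -1, 3}));
      assert((M.Blend == std::vector<int>{8, 1, 10, 3, 12, 5, 6, 15}));
    }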
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -62,20 +62,18 @@
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = fptoui <2 x float> [[TMP3]] to <2 x i32>
 ; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
 ; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
 ; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R52:%.*]] = shufflevector <8 x i32> [[R31]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R52]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -62,20 +62,18 @@
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP4:%.*]] = fptoui <2 x float> [[TMP3]] to <2 x i32>
 ; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
 ; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
 ; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R52:%.*]] = shufflevector <8 x i32> [[R31]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R52]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
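The R52 lines above can be cross-checked by hand: the two fptoui lanes form an identity run at offset 4, so the blend keeps lanes 0..3 of the fptosi half (first source) and pulls lanes 8..9, the widened fptoui values, from the second source, leaving lanes 6..7 to the remaining scalar insertelements. A minimal recomputation of that mask:

    #include <cassert>
    #include <numeric>
    #include <vector>

    // Recomputes the blend mask expected by the SSE checks above for the
    // consecutive case: NumScalars lanes starting at Offset come from the
    // widened vector (second source, lanes NumElts..), the rest stay on the
    // first source.
    int main() {
      unsigned NumElts = 8, NumScalars = 2, Offset = 4;
      std::vector<int> Blend(NumElts);
      std::iota(Blend.begin(), Blend.end(), 0); // identity over first source
      for (unsigned I = 0; I < NumScalars; ++I)
        Blend[Offset + I] = int(NumElts + I);   // pull widened lanes
      assert((Blend == std::vector<int>{0, 1, 2, 3, 8, 9, 6, 7}));
    }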
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -40,31 +40,11 @@
 define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select2(
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
-; CHECK-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
-; CHECK-NEXT:    [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <8 x float> undef, float [[S0]], i32 0
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <8 x float> [[RA]], float [[S1]], i32 2
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <8 x float> [[RB]], float [[S2]], i32 4
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <8 x float> [[RC]], float [[S3]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[RD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 undef, i32 3>
+; CHECK-NEXT:    [[RD1:%.*]] = shufflevector <8 x float> undef, <8 x float> [[TMP3]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[RD1]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1