diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2884,36 +2884,43 @@
     case Instruction::InsertElement: {
       assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");

-      int Offset = *getInsertIndex(VL[0], 0);
-      ValueList Operands(VL.size());
-      Operands[0] = cast<Instruction>(VL[0])->getOperand(1);
-      bool IsConsecutive = true;
-      for (unsigned I = 1, E = VL.size(); I < E; ++I) {
-        IsConsecutive &= (I == *getInsertIndex(VL[I], 0) - Offset);
-        Operands[I] = cast<Instruction>(VL[I])->getOperand(1);
+      // Check that we have a buildvector and not a shuffle of 2 or more
+      // different vectors.
+      int NotUsedVectors = 0;
+      ValueList Operands;
+      ValueSet SourceVectors;
+      for (Value *V : VL)
+        SourceVectors.insert(cast<InsertElementInst>(V)->getOperand(0));
+
+      for (Value *V : VL) {
+        auto *I = cast<InsertElementInst>(V);
+        if (!SourceVectors.contains(I)) {
+          ++NotUsedVectors;
+          if (NotUsedVectors >= 2) {
+            // Found 2nd source vector - cancel.
+            LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
+                                 "different source vectors.\n");
+            newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                         ReuseShuffleIndicies);
+            BS.cancelScheduling(VL, VL0);
+            return;
+          }
+        }
+        Operands.push_back(I->getOperand(1));
       }
-      // FIXME: support vectorization of non-consecutive inserts
-      // using shuffle with its proper cost estimation
-      if (IsConsecutive) {
-        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
-        LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
+      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
+      LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");

-        TE->setOperand(0, Operands);
+      TE->setOperand(0, Operands);

-        ValueList VectorOperands;
-        for (Value *V : VL)
-          VectorOperands.push_back(cast<Instruction>(V)->getOperand(0));
+      ValueList VectorOperands;
+      for (Value *V : VL)
+        VectorOperands.push_back(cast<Instruction>(V)->getOperand(0));

-        TE->setOperand(1, VectorOperands);
+      TE->setOperand(1, VectorOperands);

-        buildTree_rec(Operands, Depth + 1, {TE, 0});
-        return;
-      }
-
-      LLVM_DEBUG(dbgs() << "SLP: skipping non-consecutive inserts.\n");
-      BS.cancelScheduling(VL, VL0);
-      buildTree_rec(Operands, Depth, UserTreeIdx);
+      buildTree_rec(Operands, Depth + 1, {TE, 0});
       return;
     }
     case Instruction::Load: {
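A quick aside on the hunk above (editorial commentary, not part of the patch): the new buildTree_rec code accepts an insertelement bundle only when it forms a single buildvector, i.e. at most one insert in the bundle produces a value that is not consumed as the source-vector operand of another insert in the bundle. A minimal standalone sketch of that check, assuming hypothetical names (`InsertElt`, `isSingleBuildVector`) and plain STL containers instead of the LLVM data structures:

```cpp
#include <cstdio>
#include <unordered_set>
#include <vector>

// Simplified stand-in for an insertelement instruction: it writes one scalar
// into one lane of SourceVector and yields the value Result.
struct InsertElt {
  const void *Result;
  const void *SourceVector;
};

// Sketch of the buildvector check: collect every source vector used by the
// bundle, then count the inserts whose own result is not one of those
// sources. A second such "chain tail" means the bundle mixes two or more
// source vectors, so it is rejected (gathered) instead of vectorized.
static bool isSingleBuildVector(const std::vector<InsertElt> &Bundle) {
  std::unordered_set<const void *> SourceVectors;
  for (const InsertElt &IE : Bundle)
    SourceVectors.insert(IE.SourceVector);

  int NotUsedVectors = 0;
  for (const InsertElt &IE : Bundle)
    if (!SourceVectors.count(IE.Result) && ++NotUsedVectors >= 2)
      return false; // found 2nd source vector - cancel
  return true;
}

int main() {
  int V0, V1, V2, V3; // opaque stand-ins for SSA values; V0 starts the chain
  std::vector<InsertElt> SingleChain = {{&V1, &V0}, {&V2, &V1}, {&V3, &V2}};
  std::vector<InsertElt> TwoChains = {{&V1, &V0}, {&V3, &V2}};
  std::printf("%d %d\n", isSingleBuildVector(SingleChain),
              isSingleBuildVector(TwoChains)); // prints "1 0"
}
```

With the old IsConsecutive restriction dropped, non-consecutive insert positions are handled by the shuffle-based lowering that the following hunks price and emit.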
@@ -3806,21 +3813,40 @@
     auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
     unsigned const NumElts = SrcVecTy->getNumElements();
+    unsigned const NumScalars = VL.size();
     APInt DemandedElts = APInt::getNullValue(NumElts);
-    for (auto *V : VL)
-      DemandedElts.setBit(*getInsertIndex(V, 0));
+    // TODO: Add support for Instruction::InsertValue.
+    unsigned Offset = UINT_MAX;
+    bool IsIdentity = true;
+    SmallVector<int> ShuffleMask(NumElts, UndefMaskElem);
+    for (unsigned I = 0; I < NumScalars; ++I) {
+      Value *V = VL[I];
+      Optional<int> InsertIdx = getInsertIndex(V, 0);
+      if (!InsertIdx)
+        continue;
+      unsigned Idx = *InsertIdx;
+      DemandedElts.setBit(Idx);
+      if (Idx < Offset) {
+        Offset = Idx;
+        IsIdentity &= I == 0;
+      } else {
+        IsIdentity &= Idx - Offset == I;
+      }
+      ShuffleMask[Idx] = I;
+    }

     InstructionCost Cost = 0;
     Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                           /*Insert*/ true, /*Extract*/ false);

-    unsigned const NumScalars = VL.size();
-    unsigned const Offset = *getInsertIndex(VL[0], 0);
-    if (NumElts != NumScalars && Offset % NumScalars != 0)
+    if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0)
       Cost += TTI->getShuffleCost(
           TargetTransformInfo::SK_InsertSubvector, SrcVecTy, /*Mask*/ None,
           Offset,
           FixedVectorType::get(SrcVecTy->getElementType(), NumScalars));
+    else if (!IsIdentity)
+      Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy,
+                                  ShuffleMask);

     return Cost;
   }
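Another illustrative aside (not part of the patch): the cost hunk above derives an identity flag and a single-source scatter mask from the insert positions, and the vectorizeTree hunk further below additionally builds a two-source mask that blends the scattered value into the destination vector. A small self-contained sketch of both derivations, assuming the hypothetical helper `buildInsertMasks`, a `-1` sentinel standing in for `UndefMaskElem`, and plain STL types rather than the LLVM API:

```cpp
#include <climits>
#include <cstdio>
#include <numeric>
#include <vector>

static constexpr int kUndef = -1; // stand-in for UndefMaskElem

// Positions[Lane] is the insertelement index of scalar Lane; NumElts is the
// width of the destination vector.
struct InsertMasks {
  bool IsIdentity = true;     // every lane already lands at Offset + Lane
  unsigned Offset = UINT_MAX; // smallest insert position seen
  std::vector<int> Scatter;   // 1st shuffle: scatter lane L to Pos - Offset
  std::vector<int> Blend;     // 2nd shuffle: blend scattered value into dest
};

static InsertMasks buildInsertMasks(unsigned NumElts,
                                    const std::vector<int> &Positions) {
  InsertMasks R;
  // Pass 1: find the minimal position and check for an identity layout,
  // mirroring the IsIdentity/Offset computation in the patch.
  for (unsigned Lane = 0, E = Positions.size(); Lane < E; ++Lane) {
    unsigned Pos = Positions[Lane];
    if (Pos < R.Offset) {
      R.Offset = Pos;
      R.IsIdentity &= Lane == 0;
    } else {
      R.IsIdentity &= Pos - R.Offset == Lane;
    }
  }
  // Pass 2: single-source scatter mask over the vectorized scalars.
  R.Scatter.assign(NumElts, kUndef);
  for (unsigned Lane = 0, E = Positions.size(); Lane < E; ++Lane)
    R.Scatter[Positions[Lane] - R.Offset] = static_cast<int>(Lane);
  // Blend mask over two operands: the original destination vector
  // (elements 0..NumElts-1) and the scattered value (NumElts..2*NumElts-1).
  R.Blend.resize(NumElts);
  std::iota(R.Blend.begin(), R.Blend.end(), 0);
  for (unsigned I = 0; I < NumElts; ++I)
    if (R.Scatter[I] != kUndef)
      R.Blend[R.Offset + I] = static_cast<int>(NumElts + I);
  return R;
}

int main() {
  // simple_select2 in the test below: four scalars inserted at positions
  // 0, 2, 4 and 7 of an <8 x float>, so the layout is not an identity.
  InsertMasks M = buildInsertMasks(8, {0, 2, 4, 7});
  std::printf("identity=%d offset=%u\nscatter:", M.IsIdentity, M.Offset);
  for (int V : M.Scatter)
    std::printf(" %d", V);
  std::printf("\nblend  :");
  for (int V : M.Blend)
    std::printf(" %d", V);
  std::printf("\n");
  // Expected: scatter = 0 -1 1 -1 2 -1 -1 3 and blend = 8 1 10 3 12 5 6 15,
  // i.e. the two shufflevector masks checked in the updated test.
}
```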
@@ -4350,8 +4376,31 @@
                         << "SLP: Current total cost = " << Cost << "\n");
   }
+  // Checks if 2 insertelements are from the same buildvector.
+  auto &&AreFromSingleVector = [](Value *V1, Value *V2) {
+    if (V1->getType() != V2->getType())
+      return false;
+    Instruction *IE1 = cast<Instruction>(V1);
+    Instruction *IE2 = cast<Instruction>(V2);
+
+    do {
+      if (IE1 == V2 || IE2 == V1)
+        return true;
+      if (IE1)
+        IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
+      if (IE2)
+        IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
+    } while (IE1 || IE2);
+    return false;
+  };
+
   SmallPtrSet<Value *, 4> ExtractCostCalculated;
   InstructionCost ExtractCost = 0;
+  SmallBitVector IsIdentity;
+  SmallVector<unsigned> VF;
+  SmallVector<SmallVector<int>> ShuffleMask;
+  SmallVector<Value *> FirstUsers;
+  SmallVector<APInt> DemandedElts;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!ExtractCostCalculated.insert(EU.Scalar).second)
       continue;
@@ -4367,6 +4416,36 @@
     if (isa<FixedVectorType>(EU.Scalar->getType()))
       continue;

+    // If found user is an insertelement, do not calculate extract cost but try
+    // to detect it as a final shuffled/identity match.
+    if (EU.User && isa<InsertElementInst>(EU.User)) {
+      if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
+        Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
+        if (!InsertIdx)
+          continue;
+        int VecId = -1;
+        for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+          if (AreFromSingleVector(EU.User, FirstUsers[I])) {
+            VecId = I;
+            break;
+          }
+        }
+        if (VecId < 0) {
+          VF.push_back(FTy->getNumElements());
+          ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
+          FirstUsers.push_back(EU.User);
+          DemandedElts.push_back(APInt::getNullValue(VF.back()));
+          IsIdentity.push_back(true);
+          VecId = FirstUsers.size() - 1;
+        }
+        int Idx = *InsertIdx;
+        ShuffleMask[VecId][Idx] = EU.Lane;
+        IsIdentity.set(IsIdentity.test(VecId) &
+                       (EU.Lane == Idx || EU.Lane == UndefMaskElem));
+        DemandedElts[VecId].setBit(Idx);
+      }
+    }
+
     // If we plan to rewrite the tree in a smaller type, we will need to sign
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
@@ -4387,6 +4466,39 @@

   InstructionCost SpillCost = getSpillCost();
   Cost += SpillCost + ExtractCost;
+  for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+    if (!IsIdentity.test(I)) {
+      InstructionCost C = TTI->getShuffleCost(
+          TTI::SK_PermuteSingleSrc,
+          cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
+      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+                        << " for final shuffle of insertelement external users "
+                        << *VectorizableTree.front()->Scalars.front() << ".\n"
+                        << "SLP: Current total cost = " << Cost << "\n");
+      Cost += C;
+    }
+    unsigned VF = ShuffleMask[I].size();
+    for (int &Mask : ShuffleMask[I])
+      Mask = (Mask == UndefMaskElem ? 0 : VF) + Mask;
+    InstructionCost C = TTI->getShuffleCost(
+        TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
+        ShuffleMask[I]);
+    LLVM_DEBUG(
+        dbgs()
+        << "SLP: Adding cost " << C
+        << " for final shuffle of vector node and external insertelement users "
+        << *VectorizableTree.front()->Scalars.front() << ".\n"
+        << "SLP: Current total cost = " << Cost << "\n");
+    Cost += C;
+    InstructionCost InsertCost = TTI->getScalarizationOverhead(
+        cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
+        /*Insert*/ true,
+        /*Extract*/ false);
+    Cost -= InsertCost;
+    LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
+                      << " for insertelements gather.\n"
+                      << "SLP: Current total cost = " << Cost << "\n");
+  }

 #ifndef NDEBUG
   SmallString<256> Str;
@@ -4856,28 +4968,53 @@
       const unsigned NumScalars = E->Scalars.size();

       // Create InsertVector shuffle if necessary
-      if (NumElts != NumScalars) {
-        unsigned MinIndex = *getInsertIndex(E->Scalars[0], 0);
-        Instruction *FirstInsert = nullptr;
-        for (auto *Scalar : E->Scalars)
-          if (!FirstInsert &&
-              !is_contained(E->Scalars,
-                            cast<Instruction>(Scalar)->getOperand(0)))
-            FirstInsert = cast<Instruction>(Scalar);
-
-        // Create shuffle to resize vector
-        SmallVector<int> Mask(NumElts, UndefMaskElem);
+      Instruction *FirstInsert = nullptr;
+      bool IsIdentity = true;
+      unsigned Offset = UINT_MAX;
+      for (unsigned I = 0; I < NumScalars; ++I) {
+        Value *Scalar = E->Scalars[I];
+        if (!FirstInsert &&
+            !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0)))
+          FirstInsert = cast<Instruction>(Scalar);
+        Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
+        if (!InsertIdx)
+          continue;
+        unsigned Idx = *InsertIdx;
+        if (Idx < Offset) {
+          Offset = Idx;
+          IsIdentity &= I == 0;
+        } else {
+          IsIdentity &= Idx - Offset == I;
+        }
+      }
+
+      // Create shuffle to resize vector
+      SmallVector<int> Mask(NumElts, UndefMaskElem);
+      if (!IsIdentity) {
+        for (unsigned I = 0; I < NumScalars; ++I) {
+          Value *Scalar = E->Scalars[I];
+          Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
+          if (!InsertIdx)
+            continue;
+          Mask[*InsertIdx - Offset] = I;
+        }
+      } else {
         std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+      }
+      if (!IsIdentity || NumElts != NumScalars)
         V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);

-        const unsigned MaxIndex = MinIndex + NumScalars;
-        for (unsigned I = 0; I < NumElts; I++)
-          Mask[I] =
-              (I < MinIndex || I >= MaxIndex) ? I : NumElts - MinIndex + I;
+      if (NumElts != NumScalars) {
+        SmallVector<int> InsertMask(NumElts);
+        std::iota(InsertMask.begin(), InsertMask.end(), 0);
+        for (unsigned I = 0; I < NumElts; I++) {
+          if (Mask[I] != UndefMaskElem)
+            InsertMask[Offset + I] = NumElts + I;
+        }
         V = Builder.CreateShuffleVector(
-            FirstInsert->getOperand(0), V, Mask,
-            cast<Instruction>(E->Scalars[NumScalars - 1])->getName());
+            FirstInsert->getOperand(0), V, InsertMask,
+            cast<Instruction>(E->Scalars.back())->getName());
       }

       ++NumVectorInstructions;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -40,31 +40,11 @@
 define <8 x float> @simple_select2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
 ; CHECK-LABEL: @simple_select2(
-; CHECK-NEXT:    [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
-; CHECK-NEXT:    [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
-; CHECK-NEXT:    [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
-; CHECK-NEXT:    [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; CHECK-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
-; CHECK-NEXT:    [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
-; CHECK-NEXT:    [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
-; CHECK-NEXT:    [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <8 x float> undef, float [[S0]], i32 0
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <8 x float> [[RA]], float [[S1]], i32 2
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <8 x float> [[RB]], float [[S2]], i32 4
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <8 x float> [[RC]], float [[S3]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[RD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 undef, i32 3>
+; CHECK-NEXT:    [[RD1:%.*]] = shufflevector <8 x float> undef, <8 x float> [[TMP3]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 6, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[RD1]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1