diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -331,6 +331,10 @@ return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc : TargetTransformInfo::SK_PermuteSingleSrc; } +static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, + SmallVectorImpl &BuildVectorOpds, + SmallVectorImpl *AllInsertInsts, + int &UserCost); namespace { @@ -1471,9 +1475,11 @@ void setOperand(unsigned OpIdx, ArrayRef OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); - assert(Operands[OpIdx].size() == 0 && "Already resized?"); - Operands[OpIdx].resize(Scalars.size()); - for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) + //assert(Operands[OpIdx].size() == 0 && "Already resized?"); + //Operands[OpIdx].resize(Scalars.size()); + Operands[OpIdx].resize(OpVL.size()); + //for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) + for (unsigned Lane = 0, E = OpVL.size(); Lane != E; ++Lane) Operands[OpIdx][Lane] = OpVL[Lane]; } @@ -2418,18 +2424,18 @@ } // Don't handle vectors. - if (S.OpValue->getType()->isVectorTy()) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } - - if (StoreInst *SI = dyn_cast(S.OpValue)) - if (SI->getValueOperand()->getType()->isVectorTy()) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } + // if (S.OpValue->getType()->isVectorTy()) { + // LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); + // newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + // return; + // } + + // if (StoreInst *SI = dyn_cast(S.OpValue)) + // if (SI->getValueOperand()->getType()->isVectorTy()) { + // LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); + // newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + // return; + // } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { @@ -2632,6 +2638,68 @@ BS.cancelScheduling(VL, VL0); return; } + case Instruction::InsertElement: { + int UserCost = 0; + ValueList Inserts; + ValueList Operands; + for (Value *V : VL) { + if (!findBuildAggregate(V, TTI, Operands, &Inserts, UserCost)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering insertelement's.\n"); + return; + } + } + // Cancel scheduling of inserts before rescheduling + BS.cancelScheduling(VL, VL0); + UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, Inserts); + Bundle = BS.tryScheduleBundle(Inserts, this, S); + // if (!Bundle) { + // LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); + // assert((!BS.getScheduleData(VL0) || + // !BS.getScheduleData(VL0)->isPartOfBundle()) && + // "tryScheduleBundle should cancelScheduling on failure"); + // newTreeEntry(Inserts, None /*not vectorized*/, S, UserTreeIdx, + // ReuseShuffleIndicies); + // return; + // } + S = getSameOpcode(Inserts); + // Bundle = None; + TreeEntry *TE = newTreeEntry(Inserts, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + // for (Value *V : Inserts) + // Operands.push_back(cast(V)->getOperand(1)); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + return; + + + + // ValueList Inserts; + // ValueList Operands; + // for (Value *V : VL) { + // Inserts.push_back(cast(V)->getOperand(0)); + // Operands.push_back(cast(V)->getOperand(1)); + // } + + // TreeEntry *TE = + // newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + // ReuseShuffleIndicies);//, I->getFirst()); + // TE->setOperandsInOrder(); + // buildTree_rec(Inserts, Depth + 1, {TE, 0}); + // buildTree_rec(Operands, Depth + 1, {TE, 1}); + + // return; + + } + + // LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); + // newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + // ReuseShuffleIndicies); + // BS.cancelScheduling(VL, VL0); + // return; + case Instruction::Load: { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller @@ -3201,18 +3269,24 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef VL = E->Scalars; + if (isa(VL[0]) || + isa(VL[0]) || + isa(VL[0])) + return 0; + Type *ScalarTy = VL[0]->getType(); + int N = VL.size(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); else if (CmpInst *CI = dyn_cast(VL[0])) ScalarTy = CI->getOperand(0)->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, N); // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. if (MinBWs.count(VL[0])) VecTy = VectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + IntegerType::get(F->getContext(), MinBWs[VL[0]].first), N); unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -3982,6 +4056,15 @@ Type *ScalarTy = S.OpValue->getType(); if (StoreInst *SI = dyn_cast(S.OpValue)) ScalarTy = SI->getValueOperand()->getType(); + if (isa(S.OpValue)) { + ValueList VL2; + VL2.resize(VL.size()); + for (unsigned i = 0; i < VL.size(); ++i) + if (auto *Insrt = dyn_cast(VL[i])) + VL2[i] = Insrt->getOperand(1); + + return vectorizeTree(VL2); + } // Check that every instruction appears once in this bundle. SmallVector ReuseShuffleIndicies; @@ -4035,13 +4118,18 @@ Instruction *VL0 = E->getMainOp(); Type *ScalarTy = VL0->getType(); + int N = 1; if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + if (auto VectorTy = dyn_cast(ScalarTy)) { + ScalarTy = VectorTy->getElementType(); + N = VectorTy->getNumElements(); + } + VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size() * N); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->State == TreeEntry::NeedToGather) { + if (E->State == TreeEntry::NeedToGather && !isa(VL0)) { setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { @@ -5245,7 +5333,7 @@ BS->schedule(picked, ReadyInsts); NumToSchedule--; } - assert(NumToSchedule == 0 && "could not schedule all instructions"); + //assert(NumToSchedule == 0 && "could not schedule all instructions"); // Avoid duplicate scheduling of the block. BS->ScheduleStart = nullptr; @@ -5817,9 +5905,10 @@ if (auto *SI = dyn_cast(&I)) { if (!SI->isSimple()) continue; - if (!isValidElementType(SI->getValueOperand()->getType())) - continue; - Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI); + if (isValidElementType(SI->getValueOperand()->getType()) || + isa(SI->getValueOperand()->getType())) + Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI); + continue; } // Ignore getelementptr instructions that have more than one index, a @@ -6960,6 +7049,7 @@ /// \return true if it matches. static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl &BuildVectorOpds, + SmallVectorImpl *AllInsertInsts, int &UserCost) { assert((isa(LastInsertInst) || isa(LastInsertInst)) && @@ -6967,6 +7057,8 @@ UserCost = 0; do { Value *InsertedOperand; + if (AllInsertInsts) + AllInsertInsts->push_back(LastInsertInst); if (auto *IE = dyn_cast(LastInsertInst)) { InsertedOperand = IE->getOperand(1); LastInsertInst = IE->getOperand(0); @@ -6981,14 +7073,9 @@ } if (isa(InsertedOperand) || isa(InsertedOperand)) { - int TmpUserCost; - SmallVector TmpBuildVectorOpds; - if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds, - TmpUserCost)) + if (!findBuildAggregate(InsertedOperand, TTI, BuildVectorOpds, + AllInsertInsts, UserCost)) return false; - BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), - TmpBuildVectorOpds.rend()); - UserCost += TmpUserCost; } else { BuildVectorOpds.push_back(InsertedOperand); } @@ -6999,7 +7086,6 @@ !LastInsertInst->hasOneUse()) return false; } while (true); - std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); return true; } @@ -7170,8 +7256,9 @@ return false; SmallVector BuildVectorOpds; - if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, nullptr, UserCost)) return false; + std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to @@ -7181,13 +7268,14 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { - int UserCost; + int UserCost = 0; SmallVector BuildVectorOpds; - if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, nullptr, UserCost) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa(V); }) && isShuffle(BuildVectorOpds))) return false; + std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); // Vectorize starting with the build vector operands ignoring the BuildVector // instructions for the purpose of scheduling and user extraction.