diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8989,6 +8989,78 @@ return OpsChanged; } +template +static bool +tryToVectorizeSequence(SmallVectorImpl &Incoming, + function_ref Limit, + function_ref Comparator, + function_ref AreCompatible, + function_ref, bool)> TryToVectorize, + bool LimitForRegisterSize) { + bool Changed = false; + // Sort by type, parent, operands. + stable_sort(Incoming, Comparator); + + // Try to vectorize elements base on their type. + SmallVector Candidates; + for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) { + // Look for the next elements with the same type, parent and operand + // kinds. + auto *SameTypeIt = IncIt; + while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt)) + ++SameTypeIt; + + // Try to vectorize them. + unsigned NumElts = (SameTypeIt - IncIt); + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes (" + << NumElts << ")\n"); + // The vectorization is a 3-state attempt: + // 1. Try to vectorize instructions with the same/alternate opcodes with the + // size of maximal register at first. + // 2. Try to vectorize remaining instructions with the same type, if + // possible. This may result in the better vectorization results rather than + // if we try just to vectorize instructions with the same/alternate opcodes. + // 3. Final attempt to try to vectorize all instructions with the + // same/alternate ops only, this may result in some extra final + // vectorization. + if (NumElts > 1 && + TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) { + // Success start over because instructions might have been changed. + Changed = true; + } else if (NumElts < Limit(*IncIt) && + (Candidates.empty() || + Candidates.front()->getType() == (*IncIt)->getType())) { + Candidates.append(IncIt, std::next(IncIt, NumElts)); + } + // Final attempt to vectorize instructions with the same types. + if (Candidates.size() > 1 && + (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) { + if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) { + // Success start over because instructions might have been changed. + Changed = true; + } else if (LimitForRegisterSize) { + // Try to vectorize using small vectors. + for (auto *It = Candidates.begin(), *End = Candidates.end(); + It != End;) { + auto *SameTypeIt = It; + while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It)) + ++SameTypeIt; + unsigned NumElts = (SameTypeIt - It); + if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts), + /*LimitForRegisterSize=*/false)) + Changed = true; + It = SameTypeIt; + } + } + Candidates.clear(); + } + + // Start over at the next instruction of a different type (or the end). + IncIt = SameTypeIt; + } + return Changed; +} + bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector Incoming; @@ -8997,11 +9069,89 @@ // node. Allows better to identify the chains that can be vectorized in the // better way. DenseMap> PHIToOpcodes; + auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) { + assert(isValidElementType(V1->getType()) && + isValidElementType(V2->getType()) && + "Expected vectorizable types only."); + // It is fine to compare type IDs here, since we expect only vectorizable + // types, like ints, floats and pointers, we don't care about other type. + if (V1->getType()->getTypeID() < V2->getType()->getTypeID()) + return true; + if (V1->getType()->getTypeID() > V2->getType()->getTypeID()) + return false; + ArrayRef Opcodes1 = PHIToOpcodes[V1]; + ArrayRef Opcodes2 = PHIToOpcodes[V2]; + if (Opcodes1.size() < Opcodes2.size()) + return true; + if (Opcodes1.size() > Opcodes2.size()) + return false; + for (int I = 0, E = Opcodes1.size(); I < E; ++I) { + // Undefs are compatible with any other value. + if (isa(Opcodes1[I]) || isa(Opcodes2[I])) + continue; + if (auto *I1 = dyn_cast(Opcodes1[I])) + if (auto *I2 = dyn_cast(Opcodes2[I])) { + DomTreeNodeBase *NodeI1 = DT->getNode(I1->getParent()); + DomTreeNodeBase *NodeI2 = DT->getNode(I2->getParent()); + if (!NodeI1) + return NodeI2 != nullptr; + if (!NodeI2) + return false; + assert((NodeI1 == NodeI2) == + (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeI1 != NodeI2) + return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + continue; + return I1->getOpcode() < I2->getOpcode(); + } + if (isa(Opcodes1[I]) && isa(Opcodes2[I])) + continue; + if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) + return true; + if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) + return false; + } + return false; + }; + auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { + if (V1 == V2) + return true; + if (V1->getType() != V2->getType()) + return false; + ArrayRef Opcodes1 = PHIToOpcodes[V1]; + ArrayRef Opcodes2 = PHIToOpcodes[V2]; + if (Opcodes1.size() != Opcodes2.size()) + return false; + for (int I = 0, E = Opcodes1.size(); I < E; ++I) { + // Undefs are compatible with any other value. + if (isa(Opcodes1[I]) || isa(Opcodes2[I])) + continue; + if (auto *I1 = dyn_cast(Opcodes1[I])) + if (auto *I2 = dyn_cast(Opcodes2[I])) { + if (I1->getParent() != I2->getParent()) + return false; + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + continue; + return false; + } + if (isa(Opcodes1[I]) && isa(Opcodes2[I])) + continue; + if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID()) + return false; + } + return true; + }; + auto Limit = [&R](Value *V) { + unsigned EltSize = R.getVectorElementSize(V); + return std::max(2U, R.getMaxVecRegSize() / EltSize); + }; - bool HaveVectorizedPhiNodes = true; - while (HaveVectorizedPhiNodes) { - HaveVectorizedPhiNodes = false; - + bool HaveVectorizedPhiNodes = false; + do { // Collect the incoming values from the PHIs. Incoming.clear(); for (Instruction &I : *BB) { @@ -9039,158 +9189,15 @@ } } - // Sort by type, parent, operands. - stable_sort(Incoming, [this, &PHIToOpcodes](Value *V1, Value *V2) { - assert(isValidElementType(V1->getType()) && - isValidElementType(V2->getType()) && - "Expected vectorizable types only."); - // It is fine to compare type IDs here, since we expect only vectorizable - // types, like ints, floats and pointers, we don't care about other type. - if (V1->getType()->getTypeID() < V2->getType()->getTypeID()) - return true; - if (V1->getType()->getTypeID() > V2->getType()->getTypeID()) - return false; - ArrayRef Opcodes1 = PHIToOpcodes[V1]; - ArrayRef Opcodes2 = PHIToOpcodes[V2]; - if (Opcodes1.size() < Opcodes2.size()) - return true; - if (Opcodes1.size() > Opcodes2.size()) - return false; - for (int I = 0, E = Opcodes1.size(); I < E; ++I) { - // Undefs are compatible with any other value. - if (isa(Opcodes1[I]) || isa(Opcodes2[I])) - continue; - if (auto *I1 = dyn_cast(Opcodes1[I])) - if (auto *I2 = dyn_cast(Opcodes2[I])) { - DomTreeNodeBase *NodeI1 = DT->getNode(I1->getParent()); - DomTreeNodeBase *NodeI2 = DT->getNode(I2->getParent()); - if (!NodeI1) - return NodeI2 != nullptr; - if (!NodeI2) - return false; - assert((NodeI1 == NodeI2) == - (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && - "Different nodes should have different DFS numbers"); - if (NodeI1 != NodeI2) - return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); - InstructionsState S = getSameOpcode({I1, I2}); - if (S.getOpcode()) - continue; - return I1->getOpcode() < I2->getOpcode(); - } - if (isa(Opcodes1[I]) && isa(Opcodes2[I])) - continue; - if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) - return true; - if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) - return false; - } - return false; - }); - - auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { - if (V1 == V2) - return true; - if (V1->getType() != V2->getType()) - return false; - ArrayRef Opcodes1 = PHIToOpcodes[V1]; - ArrayRef Opcodes2 = PHIToOpcodes[V2]; - if (Opcodes1.size() != Opcodes2.size()) - return false; - for (int I = 0, E = Opcodes1.size(); I < E; ++I) { - // Undefs are compatible with any other value. - if (isa(Opcodes1[I]) || isa(Opcodes2[I])) - continue; - if (auto *I1 = dyn_cast(Opcodes1[I])) - if (auto *I2 = dyn_cast(Opcodes2[I])) { - if (I1->getParent() != I2->getParent()) - return false; - InstructionsState S = getSameOpcode({I1, I2}); - if (S.getOpcode()) - continue; - return false; - } - if (isa(Opcodes1[I]) && isa(Opcodes2[I])) - continue; - if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID()) - return false; - } - return true; - }; - - // Try to vectorize elements base on their type. - SmallVector Candidates; - for (SmallVector::iterator IncIt = Incoming.begin(), - E = Incoming.end(); - IncIt != E;) { - - // Look for the next elements with the same type, parent and operand - // kinds. - SmallVector::iterator SameTypeIt = IncIt; - while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) { - VisitedInstrs.insert(*SameTypeIt); - ++SameTypeIt; - } - - // Try to vectorize them. - unsigned NumElts = (SameTypeIt - IncIt); - LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs (" - << NumElts << ")\n"); - // The order in which the phi nodes appear in the program does not matter. - // So allow tryToVectorizeList to reorder them if it is beneficial. This - // is done when there are exactly two elements since tryToVectorizeList - // asserts that there are only two values when AllowReorder is true. - // The vectorization is a 3-state attempt: - // 1. Try to vectorize PHIs with the same/alternate opcodes with the size - // of maximal register at first. - // 2. Try to vectorize remaining PHIs with the same type, if possible. - // This may result in the better vectorization results rather than if we - // try just to vectorize PHIs with the same/alternate opcodes. - // 3. Final attempt to try to vectorize all PHIs with the same/alternate - // ops only, this may result in some extra final vectorization. - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, - /*LimitForRegisterSize=*/true)) { - // Success start over because instructions might have been changed. - HaveVectorizedPhiNodes = true; - Changed = true; - } else if (NumElts * R.getVectorElementSize(*IncIt) < - R.getMaxVecRegSize() && - (Candidates.empty() || - Candidates.front()->getType() == (*IncIt)->getType())) { - Candidates.append(IncIt, std::next(IncIt, NumElts)); - } - // Final attempt to vectorize phis with the same types. - if (Candidates.size() > 1 && - (SameTypeIt == E || - (*SameTypeIt)->getType() != (*IncIt)->getType())) { - if (tryToVectorizeList(Candidates, R)) { - // Success start over because instructions might have been changed. - HaveVectorizedPhiNodes = true; - Changed = true; - } else { - // Try to vectorize using small vectors. - for (SmallVector::iterator It = Candidates.begin(), - End = Candidates.end(); - It != End;) { - SmallVector::iterator SameTypeIt = It; - while (SameTypeIt != End && AreCompatiblePHIs(*SameTypeIt, *It)) - ++SameTypeIt; - unsigned NumElts = (SameTypeIt - It); - if (NumElts > 1 && - tryToVectorizeList(makeArrayRef(It, NumElts), R)) { - HaveVectorizedPhiNodes = true; - Changed = true; - } - It = SameTypeIt; - } - } - Candidates.clear(); - } - - // Start over at the next instruction of a different type (or the end). - IncIt = SameTypeIt; - } - } + HaveVectorizedPhiNodes = tryToVectorizeSequence( + Incoming, Limit, PHICompare, AreCompatiblePHIs, + [this, &R](ArrayRef Candidates, bool LimitForRegisterSize) { + return tryToVectorizeList(Candidates, R, LimitForRegisterSize); + }, + /*LimitForRegisterSize=*/true); + Changed |= HaveVectorizedPhiNodes; + VisitedInstrs.insert(Incoming.begin(), Incoming.end()); + } while (HaveVectorizedPhiNodes); VisitedInstrs.clear(); @@ -9443,6 +9450,10 @@ return V1->getValueOperand()->getValueID() == V2->getValueOperand()->getValueID(); }; + auto Limit = [&R, this](StoreInst *SI) { + unsigned EltSize = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); + return R.getMinVF(EltSize); + }; // Attempt to sort and vectorize each of the store-groups. for (auto &Pair : Stores) { @@ -9452,33 +9463,15 @@ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Pair.second.size() << ".\n"); - stable_sort(Pair.second, StoreSorter); - - // Try to vectorize elements based on their compatibility. - for (ArrayRef::iterator IncIt = Pair.second.begin(), - E = Pair.second.end(); - IncIt != E;) { - - // Look for the next elements with the same type. - ArrayRef::iterator SameTypeIt = IncIt; - Type *EltTy = (*IncIt)->getPointerOperand()->getType(); - - while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt)) - ++SameTypeIt; - - // Try to vectorize them. - unsigned NumElts = (SameTypeIt - IncIt); - LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores (" - << NumElts << ")\n"); - if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() && - vectorizeStores(makeArrayRef(IncIt, NumElts), R)) { - // Success start over because instructions might have been changed. - Changed = true; - } + if (!isValidElementType(Pair.second.front()->getValueOperand()->getType())) + continue; - // Start over at the next instruction of a different type (or the end). - IncIt = SameTypeIt; - } + Changed |= tryToVectorizeSequence( + Pair.second, Limit, StoreSorter, AreCompatibleStores, + [this, &R](ArrayRef Candidates, bool) { + return vectorizeStores(Candidates, R); + }, + /*LimitForRegisterSize=*/false); } return Changed; } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -313,21 +313,18 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[V1:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 undef to i16 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i64 0 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[PTR0]], align 16 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 1 -; CHECK-NEXT: store i16 [[TMP2]], i16* [[PTR1]], align 4 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 2 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[PTR2]], align 8 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 3 -; CHECK-NEXT: store i16 [[TMP2]], i16* [[PTR3]], align 4 ; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 4 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[PTR4]], align 16 ; CHECK-NEXT: [[PTR5:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 5 -; CHECK-NEXT: store i16 [[TMP2]], i16* [[PTR5]], align 4 ; CHECK-NEXT: [[PTR6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 6 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[PTR6]], align 8 ; CHECK-NEXT: [[PTR7:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 7 -; CHECK-NEXT: store i16 [[TMP2]], i16* [[PTR7]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[TMP2]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[PTR0]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; %1 = load i16, i16* %v1, align 4