diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1034,96 +1034,18 @@ #endif }; - /// A helper data structure to hold the operands of a vector of instructions. - /// This supports a fixed vector length for all operand vectors. - class VLOperands { - /// For each operand we need (i) the value, and (ii) the opcode that it - /// would be attached to if the expression was in a left-linearized form. - /// This is required to avoid illegal operand reordering. - /// For example: - /// \verbatim - /// 0 Op1 - /// |/ - /// Op1 Op2 Linearized + Op2 - /// \ / ----------> |/ - /// - - - /// - /// Op1 - Op2 (0 + Op1) - Op2 - /// \endverbatim - /// - /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. - /// - /// Another way to think of this is to track all the operations across the - /// path from the operand all the way to the root of the tree and to - /// calculate the operation that corresponds to this path. For example, the - /// path from Op2 to the root crosses the RHS of the '-', therefore the - /// corresponding operation is a '-' (which matches the one in the - /// linearized tree, as shown above). - /// - /// For lack of a better term, we refer to this operation as Accumulated - /// Path Operation (APO). - struct OperandData { - OperandData() = default; - OperandData(Value *V, bool APO, bool IsUsed) - : V(V), APO(APO), IsUsed(IsUsed) {} - /// The operand value. - Value *V = nullptr; - /// TreeEntries only allow a single opcode, or an alternate sequence of - /// them (e.g, +, -). Therefore, we can safely use a boolean value for the - /// APO. It is set to 'true' if 'V' is attached to an inverse operation - /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise - /// (e.g., Add/Mul) - bool APO = false; - /// Helper data for the reordering function. - bool IsUsed = false; - }; - - /// During operand reordering, we are trying to select the operand at lane - /// that matches best with the operand at the neighboring lane. Our - /// selection is based on the type of value we are looking for. For example, - /// if the neighboring lane has a load, we need to look for a load that is - /// accessing a consecutive address. These strategies are summarized in the - /// 'ReorderingMode' enumerator. - enum class ReorderingMode { - Load, ///< Matching loads to consecutive memory addresses - Opcode, ///< Matching instructions based on opcode (same or alternate) - Constant, ///< Matching constants - Splat, ///< Matching the same instruction multiple times (broadcast) - Failed, ///< We failed to create a vectorizable group - }; - - using OperandDataVec = SmallVector; - - /// A vector of operand vectors. - SmallVector OpsVec; - + /// A helper class used for scoring candidates for two consecutive lanes. + class LookAheadHeuristics { const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; + int NumLanes; // Total number of lanes (aka vectorization factor). + int MaxLevel; // The maximum recursion depth for accumulating score. - /// \returns the operand data at \p OpIdx and \p Lane. - OperandData &getData(unsigned OpIdx, unsigned Lane) { - return OpsVec[OpIdx][Lane]; - } - - /// \returns the operand data at \p OpIdx and \p Lane. Const version. - const OperandData &getData(unsigned OpIdx, unsigned Lane) const { - return OpsVec[OpIdx][Lane]; - } - - /// Clears the used flag for all entries. - void clearUsed() { - for (unsigned OpIdx = 0, NumOperands = getNumOperands(); - OpIdx != NumOperands; ++OpIdx) - for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; - ++Lane) - OpsVec[OpIdx][Lane].IsUsed = false; - } - - /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. - void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { - std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); - } + public: + LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE, + const BoUpSLP &R, int NumLanes, int MaxLevel) + : DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {} // The hard-coded scores listed here are not very important, though it shall // be higher for better matches to improve the resulting cost. When @@ -1169,8 +1091,7 @@ /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p /// MainAltOps. int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, - const DataLayout &DL, ScalarEvolution &SE, int NumLanes, - ArrayRef MainAltOps) { + ArrayRef MainAltOps) const { if (V1 == V2) { if (isa(V1)) { // Retruns true if the users of V1 and V2 won't need to be extracted. @@ -1192,37 +1113,37 @@ ElementCount::getFixed(NumLanes)) && ((int)V1->getNumUses() == NumLanes || AllUsersAreInternal(V1, V2))) - return VLOperands::ScoreSplatLoads; + return LookAheadHeuristics::ScoreSplatLoads; } - return VLOperands::ScoreSplat; + return LookAheadHeuristics::ScoreSplat; } auto *LI1 = dyn_cast(V1); auto *LI2 = dyn_cast(V2); if (LI1 && LI2) { if (LI1->getParent() != LI2->getParent()) - return VLOperands::ScoreFail; + return LookAheadHeuristics::ScoreFail; Optional Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); if (!Dist || *Dist == 0) - return VLOperands::ScoreFail; + return LookAheadHeuristics::ScoreFail; // The distance is too large - still may be profitable to use masked // loads/gathers. if (std::abs(*Dist) > NumLanes / 2) - return VLOperands::ScoreAltOpcodes; + return LookAheadHeuristics::ScoreAltOpcodes; // This still will detect consecutive loads, but we might have "holes" // in some cases. It is ok for non-power-2 vectorization and may produce // better results. It should not affect current vectorization. - return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreReversedLoads; + return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads + : LookAheadHeuristics::ScoreReversedLoads; } auto *C1 = dyn_cast(V1); auto *C2 = dyn_cast(V2); if (C1 && C2) - return VLOperands::ScoreConstants; + return LookAheadHeuristics::ScoreConstants; // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. @@ -1231,7 +1152,7 @@ if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { // Undefs are always profitable for extractelements. if (isa(V2)) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; Value *EV2 = nullptr; ConstantInt *Ex2Idx = nullptr; if (match(V2, @@ -1239,9 +1160,9 @@ m_Undef())))) { // Undefs are always profitable for extractelements. if (!Ex2Idx) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) - return VLOperands::ScoreConsecutiveExtracts; + return LookAheadHeuristics::ScoreConsecutiveExtracts; if (EV2 == EV1) { int Idx1 = Ex1Idx->getZExtValue(); int Idx2 = Ex2Idx->getZExtValue(); @@ -1249,22 +1170,22 @@ // The distance is too large - still may be profitable to use // shuffles. if (std::abs(Dist) == 0) - return VLOperands::ScoreSplat; + return LookAheadHeuristics::ScoreSplat; if (std::abs(Dist) > NumLanes / 2) - return VLOperands::ScoreSameOpcode; - return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts - : VLOperands::ScoreReversedExtracts; + return LookAheadHeuristics::ScoreSameOpcode; + return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts + : LookAheadHeuristics::ScoreReversedExtracts; } - return VLOperands::ScoreAltOpcodes; + return LookAheadHeuristics::ScoreAltOpcodes; } - return VLOperands::ScoreFail; + return LookAheadHeuristics::ScoreFail; } auto *I1 = dyn_cast(V1); auto *I2 = dyn_cast(V2); if (I1 && I2) { if (I1->getParent() != I2->getParent()) - return VLOperands::ScoreFail; + return LookAheadHeuristics::ScoreFail; SmallVector Ops(MainAltOps.begin(), MainAltOps.end()); Ops.push_back(I1); Ops.push_back(I2); @@ -1278,77 +1199,17 @@ return cast(V)->getNumOperands() == S.MainOp->getNumOperands(); })) - return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes - : VLOperands::ScoreSameOpcode; + return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes + : LookAheadHeuristics::ScoreSameOpcode; } if (isa(V2)) - return VLOperands::ScoreUndef; + return LookAheadHeuristics::ScoreUndef; - return VLOperands::ScoreFail; + return LookAheadHeuristics::ScoreFail; } - /// \param Lane lane of the operands under analysis. - /// \param OpIdx operand index in \p Lane lane we're looking the best - /// candidate for. - /// \param Idx operand index of the current candidate value. - /// \returns The additional score due to possible broadcasting of the - /// elements in the lane. It is more profitable to have power-of-2 unique - /// elements in the lane, it will be vectorized with higher probability - /// after removing duplicates. Currently the SLP vectorizer supports only - /// vectorization of the power-of-2 number of unique scalars. - int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { - Value *IdxLaneV = getData(Idx, Lane).V; - if (!isa(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V) - return 0; - SmallPtrSet Uniques; - for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) { - if (Ln == Lane) - continue; - Value *OpIdxLnV = getData(OpIdx, Ln).V; - if (!isa(OpIdxLnV)) - return 0; - Uniques.insert(OpIdxLnV); - } - int UniquesCount = Uniques.size(); - int UniquesCntWithIdxLaneV = - Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1; - Value *OpIdxLaneV = getData(OpIdx, Lane).V; - int UniquesCntWithOpIdxLaneV = - Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1; - if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV) - return 0; - return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) - - UniquesCntWithOpIdxLaneV) - - (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV); - } - - /// \param Lane lane of the operands under analysis. - /// \param OpIdx operand index in \p Lane lane we're looking the best - /// candidate for. - /// \param Idx operand index of the current candidate value. - /// \returns The additional score for the scalar which users are all - /// vectorized. - int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { - Value *IdxLaneV = getData(Idx, Lane).V; - Value *OpIdxLaneV = getData(OpIdx, Lane).V; - // Do not care about number of uses for vector-like instructions - // (extractelement/extractvalue with constant indices), they are extracts - // themselves and already externally used. Vectorization of such - // instructions does not add extra extractelement instruction, just may - // remove it. - if (isVectorLikeInstWithConstOps(IdxLaneV) && - isVectorLikeInstWithConstOps(OpIdxLaneV)) - return VLOperands::ScoreAllUserVectorized; - auto *IdxLaneI = dyn_cast(IdxLaneV); - if (!IdxLaneI || !isa(OpIdxLaneV)) - return 0; - return R.areAllUsersVectorized(IdxLaneI, None) - ? VLOperands::ScoreAllUserVectorized - : 0; - } - - /// Go through the operands of \p LHS and \p RHS recursively until \p + /// Go through the operands of \p LHS and \p RHS recursively until /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands /// of \p U1 and \p U2), except at the beginning of the recursion where @@ -1365,8 +1226,8 @@ /// each level recursively, accumulating the score. It starts from matching /// the additions at level 0, then moves on to the loads (level 1). The /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and - /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while - /// {A[0],C[0]} has a score of VLOperands::ScoreFail. + /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while + /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. /// Please note that the order of the operands does not matter, as we /// evaluate the score of all profitable combinations of operands. In /// other words the score of G1 and G4 is the same as G1 and G2. This @@ -1375,12 +1236,12 @@ /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, /// Luís F. W. Góes int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, - Instruction *U2, int CurrLevel, int MaxLevel, - ArrayRef MainAltOps) { + Instruction *U2, int CurrLevel, + ArrayRef MainAltOps) const { // Get the shallow score of V1 and V2. int ShallowScoreAtThisLevel = - getShallowScore(LHS, RHS, U1, U2, DL, SE, getNumLanes(), MainAltOps); + getShallowScore(LHS, RHS, U1, U2, MainAltOps); // If reached MaxLevel, // or if V1 and V2 are not instructions, @@ -1391,7 +1252,7 @@ auto *I1 = dyn_cast(LHS); auto *I2 = dyn_cast(RHS); if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || - ShallowScoreAtThisLevel == VLOperands::ScoreFail || + ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail || (((isa(I1) && isa(I2)) || (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) || (isa(I1) && isa(I2))) && @@ -1423,9 +1284,10 @@ // Recursively calculate the cost at each level int TmpScore = getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2), - I1, I2, CurrLevel + 1, MaxLevel, None); + I1, I2, CurrLevel + 1, None); // Look for the best score. - if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { + if (TmpScore > LookAheadHeuristics::ScoreFail && + TmpScore > MaxTmpScore) { MaxTmpScore = TmpScore; MaxOpIdx2 = OpIdx2; FoundBest = true; @@ -1439,6 +1301,157 @@ } return ShallowScoreAtThisLevel; } + }; + /// A helper data structure to hold the operands of a vector of instructions. + /// This supports a fixed vector length for all operand vectors. + class VLOperands { + /// For each operand we need (i) the value, and (ii) the opcode that it + /// would be attached to if the expression was in a left-linearized form. + /// This is required to avoid illegal operand reordering. + /// For example: + /// \verbatim + /// 0 Op1 + /// |/ + /// Op1 Op2 Linearized + Op2 + /// \ / ----------> |/ + /// - - + /// + /// Op1 - Op2 (0 + Op1) - Op2 + /// \endverbatim + /// + /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. + /// + /// Another way to think of this is to track all the operations across the + /// path from the operand all the way to the root of the tree and to + /// calculate the operation that corresponds to this path. For example, the + /// path from Op2 to the root crosses the RHS of the '-', therefore the + /// corresponding operation is a '-' (which matches the one in the + /// linearized tree, as shown above). + /// + /// For lack of a better term, we refer to this operation as Accumulated + /// Path Operation (APO). + struct OperandData { + OperandData() = default; + OperandData(Value *V, bool APO, bool IsUsed) + : V(V), APO(APO), IsUsed(IsUsed) {} + /// The operand value. + Value *V = nullptr; + /// TreeEntries only allow a single opcode, or an alternate sequence of + /// them (e.g, +, -). Therefore, we can safely use a boolean value for the + /// APO. It is set to 'true' if 'V' is attached to an inverse operation + /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise + /// (e.g., Add/Mul) + bool APO = false; + /// Helper data for the reordering function. + bool IsUsed = false; + }; + + /// During operand reordering, we are trying to select the operand at lane + /// that matches best with the operand at the neighboring lane. Our + /// selection is based on the type of value we are looking for. For example, + /// if the neighboring lane has a load, we need to look for a load that is + /// accessing a consecutive address. These strategies are summarized in the + /// 'ReorderingMode' enumerator. + enum class ReorderingMode { + Load, ///< Matching loads to consecutive memory addresses + Opcode, ///< Matching instructions based on opcode (same or alternate) + Constant, ///< Matching constants + Splat, ///< Matching the same instruction multiple times (broadcast) + Failed, ///< We failed to create a vectorizable group + }; + + using OperandDataVec = SmallVector; + + /// A vector of operand vectors. + SmallVector OpsVec; + + const DataLayout &DL; + ScalarEvolution &SE; + const BoUpSLP &R; + + /// \returns the operand data at \p OpIdx and \p Lane. + OperandData &getData(unsigned OpIdx, unsigned Lane) { + return OpsVec[OpIdx][Lane]; + } + + /// \returns the operand data at \p OpIdx and \p Lane. Const version. + const OperandData &getData(unsigned OpIdx, unsigned Lane) const { + return OpsVec[OpIdx][Lane]; + } + + /// Clears the used flag for all entries. + void clearUsed() { + for (unsigned OpIdx = 0, NumOperands = getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) + OpsVec[OpIdx][Lane].IsUsed = false; + } + + /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. + void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { + std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); + } + + /// \param Lane lane of the operands under analysis. + /// \param OpIdx operand index in \p Lane lane we're looking the best + /// candidate for. + /// \param Idx operand index of the current candidate value. + /// \returns The additional score due to possible broadcasting of the + /// elements in the lane. It is more profitable to have power-of-2 unique + /// elements in the lane, it will be vectorized with higher probability + /// after removing duplicates. Currently the SLP vectorizer supports only + /// vectorization of the power-of-2 number of unique scalars. + int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { + Value *IdxLaneV = getData(Idx, Lane).V; + if (!isa(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V) + return 0; + SmallPtrSet Uniques; + for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) { + if (Ln == Lane) + continue; + Value *OpIdxLnV = getData(OpIdx, Ln).V; + if (!isa(OpIdxLnV)) + return 0; + Uniques.insert(OpIdxLnV); + } + int UniquesCount = Uniques.size(); + int UniquesCntWithIdxLaneV = + Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1; + Value *OpIdxLaneV = getData(OpIdx, Lane).V; + int UniquesCntWithOpIdxLaneV = + Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1; + if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV) + return 0; + return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) - + UniquesCntWithOpIdxLaneV) - + (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV); + } + + /// \param Lane lane of the operands under analysis. + /// \param OpIdx operand index in \p Lane lane we're looking the best + /// candidate for. + /// \param Idx operand index of the current candidate value. + /// \returns The additional score for the scalar which users are all + /// vectorized. + int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { + Value *IdxLaneV = getData(Idx, Lane).V; + Value *OpIdxLaneV = getData(OpIdx, Lane).V; + // Do not care about number of uses for vector-like instructions + // (extractelement/extractvalue with constant indices), they are extracts + // themselves and already externally used. Vectorization of such + // instructions does not add extra extractelement instruction, just may + // remove it. + if (isVectorLikeInstWithConstOps(IdxLaneV) && + isVectorLikeInstWithConstOps(OpIdxLaneV)) + return LookAheadHeuristics::ScoreAllUserVectorized; + auto *IdxLaneI = dyn_cast(IdxLaneV); + if (!IdxLaneI || !isa(OpIdxLaneV)) + return 0; + return R.areAllUsersVectorized(IdxLaneI, None) + ? LookAheadHeuristics::ScoreAllUserVectorized + : 0; + } /// Score scaling factor for fully compatible instructions but with /// different number of external uses. Allows better selection of the @@ -1453,10 +1466,13 @@ int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef MainAltOps, int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed) { + LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(), + LookAheadMaxDepth); // Keep track of the instruction stack as we recurse into the operands // during the look-ahead score exploration. - int Score = getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr, - 1, LookAheadMaxDepth, MainAltOps); + int Score = + LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr, + /*CurrLevel=*/1, MainAltOps); if (Score) { int SplatScore = getSplatScore(Lane, OpIdx, Idx); if (Score <= -SplatScore) {