diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -147,6 +147,20 @@ "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +// The maximum depth that the look-ahead score heuristic will explore. +// The higher this value, the higher the compilation time overhead. +static cl::opt LookAheadMaxDepth( + "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for operand reordering scores")); + +// The Look-ahead heuristic goes through the users of the bundle to calculate +// the users cost in getExternalUsesCost(). To avoid compilation time increase +// we limit the number of users visited for each value to this budget. +static cl::opt LookAheadUsersBudget( + "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, + cl::desc("The maximum number of users to visit while visiting the " + "predecessors. This prevents compilation time increase.")); + static cl::opt ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -721,6 +735,7 @@ const DataLayout &DL; ScalarEvolution &SE; + const BoUpSLP &R; /// \returns the operand data at \p OpIdx and \p Lane. OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -746,6 +761,227 @@ std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } + // The hard-coded scores listed here are not very important. When computing + // the scores of matching one sub-tree with another, we are basically + // counting the number of values that are matching. So even if all scores + // are set to 1, we would still get a decent matching result. + // However, sometimes we have to break ties. For example we may have to + // choose between matching loads vs matching opcodes. This is what these + // scores are helping us with: they provide the order of preference. 
+ + /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). + static const int ScoreConsecutiveLoads = 3; + /// ExtractElementInst from same vector and consecutive indexes. + static const int ScoreConsecutiveExtracts = 3; + /// Constants. + static const int ScoreConstants = 2; + /// Instructions with the same opcode. + static const int ScoreSameOpcode = 2; + /// Instructions with alt opcodes (e.g., add + sub). + static const int ScoreAltOpcodes = 1; + /// Identical instructions (a.k.a. splat or broadcast). + static const int ScoreSplat = 1; + /// Matching with an undef is preferable to failing. + static const int ScoreUndef = 1; + /// Score for failing to find a decent match. + static const int ScoreFail = 0; + /// User external to the vectorized code. + static const int ExternalUseCost = 1; + /// The user is internal but in a different lane. + static const int UserInDiffLaneCost = ExternalUseCost; + + /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. + static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, + ScalarEvolution &SE) { + auto *LI1 = dyn_cast(V1); + auto *LI2 = dyn_cast(V2); + if (LI1 && LI2) + return isConsecutiveAccess(LI1, LI2, DL, SE) + ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreFail; + + auto *C1 = dyn_cast(V1); + auto *C2 = dyn_cast(V2); + if (C1 && C2) + return VLOperands::ScoreConstants; + + // Extracts from consecutive indexes of the same vector get a better score, + // as the extracts could be optimized away. NOTE(review): the index operands are cast unconditionally below; presumably a non-constant index asserts - confirm. 
+ auto *Ex1 = dyn_cast(V1); + auto *Ex2 = dyn_cast(V2); + if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() && + cast(Ex1->getIndexOperand())->getZExtValue() + 1 == + cast(Ex2->getIndexOperand())->getZExtValue()) { + return VLOperands::ScoreConsecutiveExtracts; + } + + auto *I1 = dyn_cast(V1); + auto *I2 = dyn_cast(V2); + if (I1 && I2) { + if (I1 == I2) + return VLOperands::ScoreSplat; + InstructionsState S = getSameOpcode({I1, I2}); + // Note: Only consider instructions with <= 2 operands to avoid + // complexity explosion. + if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) + return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes + : VLOperands::ScoreSameOpcode; + } + + if (isa(V2)) + return VLOperands::ScoreUndef; + + return VLOperands::ScoreFail; + } + + /// Holds the values and their lane that are taking part in the look-ahead + /// score calculation. This is used in the external uses cost calculation. + SmallDenseMap InLookAheadValues; + + /// \returns the additional cost due to uses of \p LHS and \p RHS that are + /// either external to the vectorized code, or require shuffling. + int getExternalUsesCost(const std::pair &LHS, + const std::pair &RHS) { + int Cost = 0; + SmallVector, 2> Values = {LHS, RHS}; + for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { + Value *V = Values[Idx].first; + // Calculate the absolute lane, using the minimum relative lane of LHS + // and RHS as base and Idx as the offset. + int Ln = std::min(LHS.second, RHS.second) + Idx; + assert(Ln >= 0 && "Bad lane calculation"); + unsigned UsersBudget = LookAheadUsersBudget; + for (User *U : V->users()) { + if (const TreeEntry *UserTE = R.getTreeEntry(U)) { + // The user is in the VectorizableTree. Check if we need to insert. 
+ auto It = llvm::find(UserTE->Scalars, U); + assert(It != UserTE->Scalars.end() && "U is in UserTE"); + int UserLn = std::distance(UserTE->Scalars.begin(), It); + assert(UserLn >= 0 && "Bad lane"); + if (UserLn != Ln) + Cost += UserInDiffLaneCost; + } else { + // Check if the user is in the look-ahead code. + auto It2 = InLookAheadValues.find(U); + if (It2 != InLookAheadValues.end()) { + // The user is in the look-ahead code. Check the lane. + if (It2->second != Ln) + Cost += UserInDiffLaneCost; + } else { + // The user is neither in SLP tree nor in the look-ahead code. + Cost += ExternalUseCost; + } + } + // Limit the number of visited uses to cap compilation time. NOTE(review): a budget of 0 wraps on the first decrement and so never triggers the break - confirm 0 is rejected. + if (--UsersBudget == 0) + break; + } + } + return Cost; + } + + /// Go through the operands of \p LHS and \p RHS recursively until \p + /// MaxLevel, and return the cumulative score. For example: + /// \verbatim + /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] + /// \ / \ / \ / \ / + /// + + + + + /// G1 G2 G3 G4 + /// \endverbatim + /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at + /// each level recursively, accumulating the score. It starts from matching + /// the additions at level 0, then moves on to the loads (level 1). The + /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and + /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while + /// {A[0],C[0]} has a score of VLOperands::ScoreFail. + /// Please note that the order of the operands does not matter, as we + /// evaluate the score of all profitable combinations of operands. In + /// other words the score of G1 and G4 is the same as G1 and G2. This + /// heuristic is based on ideas described in: + /// Look-ahead SLP: Auto-vectorization in the presence of commutative + /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, + /// Luís F. W. 
Góes + int getScoreAtLevelRec(const std::pair &LHS, + const std::pair &RHS, int CurrLevel, + int MaxLevel) { + + Value *V1 = LHS.first; + Value *V2 = RHS.first; + // Get the shallow score of V1 and V2. + int ShallowScoreAtThisLevel = + std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - + getExternalUsesCost(LHS, RHS)); + int Lane1 = LHS.second; + int Lane2 = RHS.second; + + // If reached MaxLevel, + // or if V1 and V2 are not instructions, + // or if they are SPLAT, + // or if they are not consecutive, early return the current cost. + auto *I1 = dyn_cast(V1); + auto *I2 = dyn_cast(V2); + if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || + ShallowScoreAtThisLevel == VLOperands::ScoreFail || + (isa(I1) && isa(I2) && ShallowScoreAtThisLevel)) + return ShallowScoreAtThisLevel; + assert(I1 && I2 && "Should have early exited."); + + // Keep track of in-tree values for determining the external-use cost. + InLookAheadValues[V1] = Lane1; + InLookAheadValues[V2] = Lane2; + + // Contains the I2 operand indexes that got matched with I1 operands. + SmallSet Op2Used; + + // Recursion towards the operands of I1 and I2. We are trying all possible + // operand pairs, and keeping track of the best score. + for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); + OpIdx1 != NumOperands1; ++OpIdx1) { + // Try to pair operand OpIdx1 of I1 with the best operand of I2. + int MaxTmpScore = 0; + unsigned MaxOpIdx2 = 0; + bool FoundBest = false; + // If I2 is commutative try all combinations. + unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1; + unsigned ToIdx = isCommutative(I2) + ? I2->getNumOperands() + : std::min(I2->getNumOperands(), OpIdx1 + 1); + assert(FromIdx <= ToIdx && "Bad index"); + for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { + // Skip I2 operands already paired with an earlier I1 operand. 
+ if (Op2Used.count(OpIdx2)) + continue; + // Recursively calculate the score of this operand pair. + int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, + {I2->getOperand(OpIdx2), Lane2}, + CurrLevel + 1, MaxLevel); + // Look for the best score. + if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { + MaxTmpScore = TmpScore; + MaxOpIdx2 = OpIdx2; + FoundBest = true; + } + } + if (FoundBest) { + // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. + Op2Used.insert(MaxOpIdx2); + ShallowScoreAtThisLevel += MaxTmpScore; + } + } + return ShallowScoreAtThisLevel; + } + + /// \returns the look-ahead score, which tells us how much the sub-trees + /// rooted at \p LHS and \p RHS match, the more they match the higher the + /// score. This helps break ties in an informed way when we cannot decide on + /// the order of the operands by just considering the immediate + /// predecessors. + int getLookAheadScore(const std::pair &LHS, + const std::pair &RHS) { + InLookAheadValues.clear(); + return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); + } + // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its opreand index. // If no good match can be found, return None. @@ -763,9 +999,6 @@ // The linearized opcode of the operand at OpIdx, Lane. bool OpIdxAPO = getData(OpIdx, Lane).APO; - const unsigned BestScore = 2; - const unsigned GoodScore = 1; - // The best operand index and its score. // Sometimes we have more than one option (e.g., Opcode and Undefs), so we // are using the score to differentiate between the two. @@ -794,41 +1027,19 @@ // Look for an operand that matches the current mode. switch (RMode) { case ReorderingMode::Load: - if (isa(Op)) { - // Figure out which is left and right, so that we can check for - // consecutive loads - bool LeftToRight = Lane > LastLane; - Value *OpLeft = (LeftToRight) ? OpLastLane : Op; - Value *OpRight = (LeftToRight) ? 
Op : OpLastLane; - if (isConsecutiveAccess(cast(OpLeft), - cast(OpRight), DL, SE)) - BestOp.Idx = Idx; - } - break; - case ReorderingMode::Opcode: - // We accept both Instructions and Undefs, but with different scores. - if ((isa(Op) && isa(OpLastLane) && - cast(Op)->getOpcode() == - cast(OpLastLane)->getOpcode()) || - (isa(OpLastLane) && isa(Op)) || - isa(Op)) { - // An instruction has a higher score than an undef. - unsigned Score = (isa(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } - } - break; case ReorderingMode::Constant: - if (isa(Op)) { - unsigned Score = (isa(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } + case ReorderingMode::Opcode: { + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + unsigned Score = + getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; } break; + } case ReorderingMode::Splat: if (Op == OpLastLane) BestOp.Idx = Idx; @@ -959,8 +1170,8 @@ public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef RootVL, const DataLayout &DL, - ScalarEvolution &SE) - : DL(DL), SE(SE) { + ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1189,7 +1400,8 @@ SmallVectorImpl &Left, SmallVectorImpl &Right, const DataLayout &DL, - ScalarEvolution &SE); + ScalarEvolution &SE, + const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -2550,7 +2762,7 @@ // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. 
assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -2597,7 +2809,7 @@ // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2789,7 +3001,7 @@ // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -3550,13 +3762,15 @@ // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode( - ArrayRef VL, SmallVectorImpl &Left, - SmallVectorImpl &Right, const DataLayout &DL, - ScalarEvolution &SE) { +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, + SmallVectorImpl &Left, + SmallVectorImpl &Right, + const DataLayout &DL, + ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE); + VLOperands Ops(VL, DL, SE, R); // Reorder the operands in place. 
Ops.reorder(); Left = Ops.getVL(0); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -6,19 +6,16 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i64> [[V0:%.*]], i32 0 -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i64> [[V0]], i32 1 -; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i64> [[V1:%.*]], i32 0 -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i64> [[V1]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i32 0 -; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i32 1 -; CHECK-NEXT: ret <2 x i64> [[TMP3_1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]] +; CHECK-NEXT: ret <2 x i64> [[TMP9]] ; %v0.0 = 
extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -77,22 +74,16 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: ret <4 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> 
[[TMP6]], <4 x i32> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ -123,18 +114,16 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 
@@ -240,28 +229,22 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i32> [[TMP12]], -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i32> [[TMP15]], 
[[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP16]]) -; CHECK-NEXT: ret i32 [[TMP17]] +; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP10]], +; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: ret i32 [[TMP15]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -27,22 +27,19 @@ ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7 -; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8 -; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8 -; CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8 -; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8 -; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8 -; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8 -; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8 -; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], 
align 8 -; CHECK-NEXT: [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]] -; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]] -; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]] -; CHECK-NEXT: [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]] -; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]] -; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]] -; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8 -; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -164,22 +161,23 @@ ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7 -; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8 -; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8 -; 
CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8 -; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8 -; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8 -; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8 -; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8 -; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8 -; CHECK-NEXT: [[ADDAB_0:%.*]] = fadd fast double [[A_0]], [[B_0]] -; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]] -; CHECK-NEXT: [[ADDCD_1:%.*]] = fadd fast double [[C_1]], [[D_1]] -; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]] -; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[ADDAB_0]], [[SUBCD_0]] -; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[ADDCD_1]], [[SUBAB_1]] -; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8 -; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = 
shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -239,6 +237,97 @@ ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 +; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 +; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 +; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 +; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8 +; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: 
[[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 +; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8 +; CHECK-NEXT: ret void +; +entry: + %IdxA0 = getelementptr inbounds double, double* %A, i64 0 + %IdxB0 = getelementptr inbounds double, double* %B, i64 0 + %IdxC0 = getelementptr inbounds double, double* %C, i64 0 + %IdxD0 = getelementptr inbounds double, double* %D, i64 0 + + %IdxA1 = getelementptr inbounds double, double* %A, i64 1 + %IdxB2 = getelementptr inbounds double, double* %B, i64 2 + %IdxA2 = getelementptr inbounds double, double* %A, i64 2 + %IdxB1 = getelementptr inbounds double, double* %B, i64 1 + + %A0 = load double, double *%IdxA0, align 8 + %B0 = load double, double *%IdxB0, align 8 + %C0 = load double, double *%IdxC0, align 8 + %D0 = load double, double *%IdxD0, align 8 + + %A1 = load double, double *%IdxA1, align 8 + %B2 = load double, double *%IdxB2, align 8 + %A2 = load double, double *%IdxA2, align 8 + %B1 = load double, double *%IdxB1, align 8 + + %subA0B0 = fsub fast double %A0, %B0 + %subC0D0 = fsub fast double %C0, %D0 + + %subA1B2 = fsub fast double %A1, %B2 + %subA2B1 = fsub fast double %A2, %B1 + + %add0 = fadd fast double %subA0B0, %subC0D0 + %add1 = fadd fast double %subA1B2, %subA2B1 + + %IdxS0 = getelementptr inbounds double, double* %S, i64 0 + %IdxS1 = getelementptr inbounds double, double* %S, i64 1 + + store double %add0, double *%IdxS0, align 8 + store double %add1, double *%IdxS1, align 8 + + ; External use + store double %A1, double *%Ext1, align 8 + ret void +} + +; A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1] +; \ / \ / / \ / \ / \ +; - - U1,U2,U3 - - U4,U5 +; \ / \ / +; + + +; | | +; S[0] S[1] +; +; +; If we limit the users budget for the look-ahead heuristic to 2, then the +; look-ahead heuristic has no way of choosing B[1] (with 2 external 
users) +; over A[1] (with 3 external users). +; The result is that the operands of the Add are not reordered and the loads +; from A get vectorized instead of the loads from B. +; +define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2, double *%Ext3, double *%Ext4, double *%Ext5) { +; CHECK-LABEL: @lookahead_limit_users_budget( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 +; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0 +; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0 +; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0 +; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 +; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 +; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 ; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 @@ -262,6 +351,10 @@ ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 ; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8 +; CHECK-NEXT: store double [[TMP12]], double* [[EXT2:%.*]], align 8 +; CHECK-NEXT: store double [[TMP12]], double* [[EXT3:%.*]], align 8 +; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8 +; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -300,7 +393,104 @@ store double %add0, double *%IdxS0, align 8 store double %add1, double *%IdxS1, align 8 - ; External use + ; External uses of 
A1 store double %A1, double *%Ext1, align 8 + store double %A1, double *%Ext2, align 8 + store double %A1, double *%Ext3, align 8 + + ; External uses of B1 + store double %B1, double *%Ext4, align 8 + store double %B1, double *%Ext5, align 8 + + ret void +} + +; This checks that the lookahead code does not crash when instructions with the same opcodes have different numbers of operands (in this case the calls). + +%Class = type { i8 } +declare double @_ZN1i2ayEv(%Class*) +declare double @_ZN1i2axEv() + +define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) { +; CHECK-LABEL: @lookahead_crash( +; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 +; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]]) +; CHECK-NEXT: [[C1:%.*]] = call double @_ZN1i2axEv() +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: ret void +; + %IdxA0 = getelementptr inbounds double, double* %A, i64 0 + %IdxA1 = getelementptr inbounds double, double* %A, i64 1 + + %A0 = load double, double *%IdxA0, align 8 + %A1 = load double, double *%IdxA1, align 8 + + %C0 = call double @_ZN1i2ayEv(%Class *%Arg0) + %C1 = call double @_ZN1i2axEv() + + %add0 = fadd fast double %A0, %C0 + %add1 = 
fadd fast double %A1, %C1 + + %IdxS0 = getelementptr inbounds double, double* %S, i64 0 + %IdxS1 = getelementptr inbounds double, double* %S, i64 1 + store double %add0, double *%IdxS0, align 8 + store double %add1, double *%IdxS1, align 8 + ret void +} + +; This checks that we choose to group consecutive extracts from the same vectors. +define void @ChecksExtractScores(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2) { +; CHECK-LABEL: @ChecksExtractScores( +; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 +; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 +; CHECK-NEXT: [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4 +; CHECK-NEXT: [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4 +; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 +; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[LOADA0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[LOADA1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 +; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: ret void +; + %idx0 = getelementptr inbounds double, double* %array, i64 0 + 
%idx1 = getelementptr inbounds double, double* %array, i64 1 + %loadA0 = load double, double* %idx0, align 4 + %loadA1 = load double, double* %idx1, align 4 + + %loadVec = load <2 x double>, <2 x double>* %vecPtr1, align 4 + %extrA0 = extractelement <2 x double> %loadVec, i32 0 + %extrA1 = extractelement <2 x double> %loadVec, i32 1 + %loadVec2 = load <2 x double>, <2 x double>* %vecPtr2, align 4 + %extrB0 = extractelement <2 x double> %loadVec2, i32 0 + %extrB1 = extractelement <2 x double> %loadVec2, i32 1 + + %mul0 = fmul double %extrA0, %loadA0 + %mul1 = fmul double %extrA1, %loadA0 + %mul3 = fmul double %extrB0, %loadA1 + %mul4 = fmul double %extrB1, %loadA1 + %add0 = fadd double %mul0, %mul3 + %add1 = fadd double %mul1, %mul4 + + %sidx0 = getelementptr inbounds double, double* %storeArray, i64 0 + %sidx1 = getelementptr inbounds double, double* %storeArray, i64 1 + store double %add0, double *%sidx0, align 8 + store double %add1, double *%sidx1, align 8 ret void }