Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -143,6 +143,12 @@
     "slp-min-tree-size", cl::init(3), cl::Hidden,
     cl::desc("Only vectorize small trees if they are fully vectorizable"));
 
+// The maximum depth that the look-ahead score heuristic will explore.
+// The higher this value, the higher the compilation time overhead.
+static cl::opt<int> LookAheadMaxDepth(
+    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
+    cl::desc("The maximum look-ahead depth for operand reordering scores"));
+
 static cl::opt<bool>
 ViewSLPTree("view-slp-tree", cl::Hidden,
             cl::desc("Display the SLP trees with Graphviz"));
@@ -724,6 +730,142 @@
     std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
   }
 
+  // The hard-coded scores listed here are not very important. When computing
+  // the scores of matching one sub-tree with another, we are basically
+  // counting the number of values that are matching. So even if all scores
+  // are set to 1, we would still get a decent matching result.
+  // However, sometimes we have to break ties. For example, we may have to
+  // choose between matching loads vs. matching opcodes. This is what these
+  // scores are helping us with: they provide the order of preference.
+
+  /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
+  static const int ScoreConsecutiveLoads = 3;
+  /// Constants.
+  static const int ScoreConstants = 2;
+  /// Instructions with the same opcode.
+  static const int ScoreSameOpcode = 2;
+  /// Instructions with alt opcodes (e.g., add + sub).
+  static const int ScoreAltOpcodes = 1;
+  /// Identical instructions (a.k.a. splat or broadcast).
+  static const int ScoreSplat = 1;
+  /// Matching with an undef is preferable to failing.
+  static const int ScoreUndef = 1;
+  /// Score for failing to find a decent match.
+  static const int ScoreFail = 0;
+
+  /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
+  static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
+                             ScalarEvolution &SE) {
+    auto *LI1 = dyn_cast<LoadInst>(V1);
+    auto *LI2 = dyn_cast<LoadInst>(V2);
+    if (LI1 && LI2)
+      return isConsecutiveAccess(LI1, LI2, DL, SE)
+                 ? VLOperands::ScoreConsecutiveLoads
+                 : VLOperands::ScoreFail;
+
+    auto *C1 = dyn_cast<Constant>(V1);
+    auto *C2 = dyn_cast<Constant>(V2);
+    if (C1 && C2)
+      return VLOperands::ScoreConstants;
+
+    auto *I1 = dyn_cast<Instruction>(V1);
+    auto *I2 = dyn_cast<Instruction>(V2);
+    if (I1 && I2) {
+      if (I1 == I2)
+        return VLOperands::ScoreSplat;
+      InstructionsState S = getSameOpcode({I1, I2});
+      if (S.getOpcode())
+        return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
+                                : VLOperands::ScoreSameOpcode;
+    }
+
+    if (isa<UndefValue>(V2))
+      return VLOperands::ScoreUndef;
+
+    return VLOperands::ScoreFail;
+  }
+
+  /// Go through the operands of \p V1 and \p V2 recursively until \p
+  /// MaxLevel, and return the cumulative score. For example:
+  /// \verbatim
+  ///  A[0]  B[0]  A[1]  B[1]  C[0]  D[0]  B[1]  A[1]
+  ///     \ /         \ /         \ /         \ /
+  ///      +           +           +           +
+  ///      G1          G2          G3          G4
+  /// \endverbatim
+  /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
+  /// each level recursively, accumulating the score. It starts from matching
+  /// the additions at level 0, then moves on to the loads (level 1). The
+  /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
+  /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
+  /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+  /// Please note that the order of the operands does not matter, as we
+  /// evaluate the score of all profitable combinations of operands. In
+  /// other words, the score of G1 and G4 is the same as G1 and G2. This
+  /// heuristic is based on ideas described in:
+  ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
+  ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+  ///   Luís F. W. Góes
+  static int getScoreAtLevelRec(Value *V1, Value *V2, int CurrLevel,
+                                int MaxLevel, const DataLayout &DL,
+                                ScalarEvolution &SE) {
+    // Get the shallow score of V1 and V2.
+    int ShallowScoreAtThisLevel = getShallowScore(V1, V2, DL, SE);
+
+    // If reached MaxLevel,
+    //  or if V1 and V2 are not instructions,
+    //  or if they are SPLAT,
+    //  or if they are not consecutive, early return the current cost.
+    auto *I1 = dyn_cast<Instruction>(V1);
+    auto *I2 = dyn_cast<Instruction>(V2);
+    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
+        ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+        (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+      return ShallowScoreAtThisLevel;
+    assert(I1 && I2 && "Should have early exited.");
+
+    // Contains the I2 operand indexes that got matched with I1 operands.
+    SmallSet<int, 4> Op2Used;
+
+    // Recursion towards the operands of I1 and I2. We are trying all possible
+    // operand pairs, and keeping track of the best score.
+    for (int OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
+         OpIdx1 != NumOperands1; ++OpIdx1) {
+      // Try to pair the operand at OpIdx1 with the best operand of I2.
+      int MaxTmpScore = 0;
+      int MaxOpIdx2 = -1;
+      for (int OpIdx2 = 0, NumOperands2 = I2->getNumOperands();
+           OpIdx2 != NumOperands2; ++OpIdx2) {
+        // Skip operands already paired with OpIdx1.
+        if (Op2Used.count(OpIdx2))
+          continue;
+        // Recursively calculate the cost at each level.
+        int TmpScore =
+            getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
+                               CurrLevel + 1, MaxLevel, DL, SE);
+        // Look for the best score.
+        if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+          MaxTmpScore = TmpScore;
+          MaxOpIdx2 = OpIdx2;
+        }
+      }
+      if (MaxOpIdx2 >= 0) {
+        // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
+        Op2Used.insert(MaxOpIdx2);
+        ShallowScoreAtThisLevel += MaxTmpScore;
+      }
+    }
+    return ShallowScoreAtThisLevel;
+  }
+
+  /// \returns the look-ahead score, which tells us how much the sub-trees
+  /// rooted at \p LHS and \p RHS match: the more they match, the higher the
+  /// score.
+  static int getLookAheadScore(Value *LHS, Value *RHS, const DataLayout &DL,
+                               ScalarEvolution &SE) {
+    return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth, DL, SE);
+  }
+
   // Search all operands in Ops[*][Lane] for the one that matches best
   // Ops[OpIdx][LastLane] and return its opreand index.
   // If no good match can be found, return None.
@@ -783,21 +925,14 @@
           BestOp.Idx = Idx;
         }
         break;
-      case ReorderingMode::Opcode:
-        // We accept both Instructions and Undefs, but with different scores.
-        if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
-             cast<Instruction>(Op)->getOpcode() ==
-                 cast<Instruction>(OpLastLane)->getOpcode()) ||
-            (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
-            isa<UndefValue>(Op)) {
-          // An instruction has a higher score than an undef.
-          unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
-          if (Score > BestOp.Score) {
-            BestOp.Idx = Idx;
-            BestOp.Score = Score;
-          }
-        }
-        break;
+      case ReorderingMode::Opcode: {
+        unsigned Score = getLookAheadScore(OpLastLane, Op, DL, SE);
+        if (Score > BestOp.Score) {
+          BestOp.Idx = Idx;
+          BestOp.Score = Score;
+        }
+        break;
+      }
       case ReorderingMode::Constant:
         if (isa<Constant>(Op)) {
           unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
Index: test/Transforms/SLPVectorizer/X86/lookahead.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -1,7 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s
 ;
-; This checks the look-ahead operand reordering heuristic
+; This file tests the look-ahead operand reordering heuristic.
+;
+;
+; This checks that operand reordering will reorder the operands of the adds
+; by taking into consideration the instructions beyond the immediate
+; predecessors.
 ;
 ;  A[0] B[0]  C[0] D[0]  C[1] D[1]  A[1] B[1]
 ;     \  /      \  /       \  /       \  /
@@ -11,8 +16,8 @@
 ;          |                     |
 ;         S[0]                  S[1]
 ;
-define void @test(double* %array) {
-; CHECK-LABEL: @test(
+define void @lookahead_basic(double* %array) {
+; CHECK-LABEL: @lookahead_basic(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
 ; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
@@ -22,22 +27,19 @@
 ; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
 ; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
 ; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
-; CHECK-NEXT:    [[A_0:%.*]] = load double, double* [[IDX0]], align 8
-; CHECK-NEXT:    [[A_1:%.*]] = load double, double* [[IDX1]], align 8
-; CHECK-NEXT:    [[B_0:%.*]] = load double, double* [[IDX2]], align 8
-; CHECK-NEXT:    [[B_1:%.*]] = load double, double* [[IDX3]], align 8
-; CHECK-NEXT:    [[C_0:%.*]] = load double, double* [[IDX4]], align 8
-; CHECK-NEXT:    [[C_1:%.*]] = load double, double* [[IDX5]], align 8
-; CHECK-NEXT:    [[D_0:%.*]] = load double, double* [[IDX6]], align 8
-; CHECK-NEXT:    [[D_1:%.*]] = load double, double* [[IDX7]], align 8
-; CHECK-NEXT:    [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]]
-; CHECK-NEXT:    [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
-; CHECK-NEXT:    [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
-; CHECK-NEXT:    [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]]
-; CHECK-NEXT:    [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]]
-; CHECK-NEXT:    [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]]
-; CHECK-NEXT:    store double [[ADDABCD_0]], double* [[IDX0]], align 8
-; CHECK-NEXT:    store double [[ADDCDAB_1]], double* [[IDX1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -72,3 +74,141 @@
   store double %addCDAB_1, double *%idx1, align 8
   ret void
 }
+
+
+; Check whether the look-ahead operand reordering heuristic will avoid
+; bundling the alt opcodes. The vectorized code should have no shuffles.
+;
+;  A[0] B[0]  A[0] B[0]  A[1] B[1]  A[1] B[1]
+;     \  /      \  /       \  /       \  /
+;      +         -           -          +
+;        \      /              \       /
+;           +                      +
+;           |                      |
+;          S[0]                   S[1]
+;
+define void @lookahead_alt1(double* %array) {
+; CHECK-LABEL: @lookahead_alt1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
+; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
+; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
+; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %idx0 = getelementptr inbounds double, double* %array, i64 0
+  %idx1 = getelementptr inbounds double, double* %array, i64 1
+  %idx2 = getelementptr inbounds double, double* %array, i64 2
+  %idx3 = getelementptr inbounds double, double* %array, i64 3
+  %idx4 = getelementptr inbounds double, double* %array, i64 4
+  %idx5 = getelementptr inbounds double, double* %array, i64 5
+  %idx6 = getelementptr inbounds double, double* %array, i64 6
+  %idx7 = getelementptr inbounds double, double* %array, i64 7
+
+  %A_0 = load double, double *%idx0, align 8
+  %A_1 = load double, double *%idx1, align 8
+  %B_0 = load double, double *%idx2, align 8
+  %B_1 = load double, double *%idx3, align 8
+
+  %addAB_0_L = fadd fast double %A_0, %B_0
+  %subAB_0_R = fsub fast double %A_0, %B_0
+
+  %subAB_1_L = fsub fast double %A_1, %B_1
+  %addAB_1_R = fadd fast double %A_1, %B_1
+
+  %addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R
+  %addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R
+
+  store double %addABCD_0, double *%idx0, align 8
+  store double %addCDAB_1, double *%idx1, align 8
+  ret void
+}
+
+
+; This code should get vectorized all the way to the loads with shuffles for
+; the alt opcodes.
+;
+;  A[0] B[0]  C[0] D[0]  C[1] D[1]  A[1] B[1]
+;     \  /      \  /       \  /       \  /
+;      +         -           +          -
+;        \      /              \       /
+;           +                      +
+;           |                      |
+;          S[0]                   S[1]
+;
+define void @lookahead_alt2(double* %array) {
+; CHECK-LABEL: @lookahead_alt2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
+; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
+; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
+; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
+; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
+; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
+; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %idx0 = getelementptr inbounds double, double* %array, i64 0
+  %idx1 = getelementptr inbounds double, double* %array, i64 1
+  %idx2 = getelementptr inbounds double, double* %array, i64 2
+  %idx3 = getelementptr inbounds double, double* %array, i64 3
+  %idx4 = getelementptr inbounds double, double* %array, i64 4
+  %idx5 = getelementptr inbounds double, double* %array, i64 5
+  %idx6 = getelementptr inbounds double, double* %array, i64 6
+  %idx7 = getelementptr inbounds double, double* %array, i64 7
+
+  %A_0 = load double, double *%idx0, align 8
+  %A_1 = load double, double *%idx1, align 8
+  %B_0 = load double, double *%idx2, align 8
+  %B_1 = load double, double *%idx3, align 8
+  %C_0 = load double, double *%idx4, align 8
+  %C_1 = load double, double *%idx5, align 8
+  %D_0 = load double, double *%idx6, align 8
+  %D_1 = load double, double *%idx7, align 8
+
+  %addAB_0 = fadd fast double %A_0, %B_0
+  %subCD_0 = fsub fast double %C_0, %D_0
+
+  %addCD_1 = fadd fast double %C_1, %D_1
+  %subAB_1 = fsub fast double %A_1, %B_1
+
+  %addABCD_0 = fadd fast double %addAB_0, %subCD_0
+  %addCDAB_1 = fadd fast double %addCD_1, %subAB_1
+
+  store double %addABCD_0, double *%idx0, align 8
+  store double %addCDAB_1, double *%idx1, align 8
+  ret void
+}
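
The heuristic in a nutshell (illustration only, not part of the patch itself): getScoreAtLevelRec() adds up the shallow score of the two candidate values and then greedily pairs each operand of the first instruction with the best-scoring, not-yet-used operand of the second, recursing up to LookAheadMaxDepth levels. The standalone C++ sketch below models just that matching loop; Node, shallowScore and scoreAtLevelRec are made-up names using a flat same-opcode score, not the LLVM APIs used in the patch.

// Standalone model of the look-ahead scoring: greedy pairwise matching of
// operand sub-trees up to a maximum depth. All names here are placeholders.
#include <cstddef>
#include <set>
#include <string>
#include <vector>

struct Node {
  std::string Opcode;                 // e.g. "fadd", "load A[0]"
  std::vector<const Node *> Operands; // operand sub-trees
};

// Score of placing N1 and N2 in adjacent lanes, ignoring their operands.
// A flat scheme is enough for the sketch; the patch uses the richer
// ScoreConsecutiveLoads/ScoreSameOpcode/... ordering instead.
static int shallowScore(const Node &N1, const Node &N2) {
  return N1.Opcode == N2.Opcode ? 1 : 0;
}

// Accumulate the shallow score, then greedily pair each operand of N1 with
// the best-scoring unused operand of N2, recursing until MaxLevel.
static int scoreAtLevelRec(const Node &N1, const Node &N2, int CurrLevel,
                           int MaxLevel) {
  int Score = shallowScore(N1, N2);
  if (CurrLevel == MaxLevel || Score == 0)
    return Score;
  std::set<std::size_t> Op2Used; // operands of N2 already matched
  for (const Node *Op1 : N1.Operands) {
    int BestScore = 0;
    std::size_t BestIdx = N2.Operands.size(); // "no match" sentinel
    for (std::size_t I = 0, E = N2.Operands.size(); I != E; ++I) {
      if (Op2Used.count(I))
        continue;
      int TmpScore = scoreAtLevelRec(*Op1, *N2.Operands[I], CurrLevel + 1,
                                     MaxLevel);
      if (TmpScore > BestScore) {
        BestScore = TmpScore;
        BestIdx = I;
      }
    }
    if (BestIdx != N2.Operands.size()) {
      Op2Used.insert(BestIdx);
      Score += BestScore;
    }
  }
  return Score;
}

With the patch's actual getShallowScore(), where consecutive loads outscore a plain opcode match and same opcodes outscore alt opcodes, this recursion is what makes G1/G2 in the doc comment outscore G1/G3, and what lets the lookahead_alt1 test bundle the two fadds and the two fsubs together without shuffles.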