Index: llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -113,9 +113,11 @@ /// Try to find horizontal reduction or otherwise vectorize a chain of binary /// operators. + /// \p MinRdxVals specifies the minimum number of values needed to form a + /// a reduction. bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, slpvectorizer::BoUpSLP &R, - TargetTransformInfo *TTI); + TargetTransformInfo *TTI, unsigned MinRdxVals); /// Try to vectorize trees that start at insertvalue instructions. bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6475,7 +6475,7 @@ /// Attempt to vectorize the tree found by /// matchAssociativeReduction. - bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { + bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, unsigned MinRdxVals) { if (ReducedVals.empty()) return false; @@ -6483,11 +6483,11 @@ // to a nearby power-of-2. Can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < 4) + if (NumReducedVals < MinRdxVals) return false; unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - + unsigned MinRdxWidth = Log2_32(MinRdxVals); Value *VectorizedTree = nullptr; // FIXME: Fast-math-flags should be set based on the instructions in the @@ -6511,7 +6511,7 @@ SmallVector IgnoreList; for (auto &V : ReductionOps) IgnoreList.append(V.begin(), V.end()); - while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { + while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > MinRdxWidth) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); Optional> Order = V.bestOrder(); @@ -6837,7 +6837,7 @@ /// performed. static bool tryToVectorizeHorReductionOrInstOperands( PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, + TargetTransformInfo *TTI, unsigned MinRdxVals, const function_ref Vectorize) { if (!ShouldVectorizeHor) return false; @@ -6868,7 +6868,7 @@ if (BI || SI) { HorizontalReduction HorRdx; if (HorRdx.matchAssociativeReduction(P, Inst)) { - if (HorRdx.tryToReduce(R, TTI)) { + if (HorRdx.tryToReduce(R, TTI, MinRdxVals)) { Res = true; // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. @@ -6911,7 +6911,8 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI) { + TargetTransformInfo *TTI, + unsigned MinRdxVals) { if (!V) return false; auto *I = dyn_cast(V); @@ -6924,7 +6925,7 @@ auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { return tryToVectorize(I, R); }; - return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, + return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, MinRdxVals, ExtraVectorization); } @@ -6967,7 +6968,7 @@ bool OpsChanged = false; for (int Idx = 0; Idx < 2; ++Idx) { OpsChanged |= - vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI); + vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI, 4); } return OpsChanged; } @@ -7079,7 +7080,7 @@ // Try to match and vectorize a horizontal reduction. if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, - TTI)) { + TTI, 4)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -7097,8 +7098,8 @@ bool OpsChanged = false; if (ShouldStartVectorizeHorAtStore || !isa(it)) { for (auto *V : it->operand_values()) { - // Try to match and vectorize a horizontal reduction. - OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI); + // Try to match a horizontal reduction of at least 4 elements. + OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI, 4); } } // Start vectorization of post-process list of instructions from the @@ -7120,6 +7121,22 @@ PostProcessInstructions.push_back(&*it); } + // Make a final attempt to match reductions -- including 2-way reductions -- + // if nothing else worked. We do not try this above because it may interfere + // with other vectorization attempts. + // TODO: The constraints are copied from the above call to + // vectorizeRootInstruction(), but that might be too restrictive? + BasicBlock::iterator LastInst = --BB->end(); + if (!Changed && LastInst->use_empty() && + (LastInst->getType()->isVoidTy() || isa(LastInst) || + isa(LastInst))) { + if (ShouldStartVectorizeHorAtStore || !isa(LastInst)) { + for (auto *V : LastInst->operand_values()) { + Changed |= vectorizeRootInstruction(nullptr, V, BB, R, TTI, 2); + } + } + } + return Changed; } Index: llvm/test/Feature/weak_constant.ll =================================================================== --- llvm/test/Feature/weak_constant.ll +++ llvm/test/Feature/weak_constant.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -O3 -S > %t -; RUN: grep undef %t | count 1 +; RUN: grep undef %t | count 2 ; RUN: grep 5 %t | count 1 ; RUN: grep 7 %t | count 1 ; RUN: grep 9 %t | count 1 Index: llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -54,10 +54,10 @@ define i1 @two_wide_fcmp_reduction(<2 x double> %a0) { ; CHECK-LABEL: @two_wide_fcmp_reduction( ; CHECK-NEXT: [[A:%.*]] = fcmp ogt <2 x double> [[A0:%.*]], -; CHECK-NEXT: [[B:%.*]] = extractelement <2 x i1> [[A]], i32 0 -; CHECK-NEXT: [[C:%.*]] = extractelement <2 x i1> [[A]], i32 1 -; CHECK-NEXT: [[D:%.*]] = and i1 [[B]], [[C]] -; CHECK-NEXT: ret i1 [[D]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[A]], <2 x i1> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[A]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 +; CHECK-NEXT: ret i1 [[TMP1]] ; %a = fcmp ogt <2 x double> %a0, %b = extractelement <2 x i1> %a, i32 0 @@ -96,12 +96,11 @@ ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[TMP8]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: ; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0