diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2016,11 +2016,13 @@
   /// for a pair which have highest score deemed to have best chance to form
   /// root of profitable tree to vectorize. Return None if no candidate scored
   /// above the LookAheadHeuristics::ScoreFail.
+  /// \param Limit Lower limit of the cost, considered to be good enough score.
   Optional<int>
-  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates) {
+  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
+                   int Limit = LookAheadHeuristics::ScoreFail) {
     LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
                                   RootLookAheadMaxDepth);
-    int BestScore = LookAheadHeuristics::ScoreFail;
+    int BestScore = Limit;
     Optional<int> Index = None;
     for (int I : seq<int>(0, Candidates.size())) {
       int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
@@ -4524,10 +4526,64 @@
   // If all of the operands are identical or constant we have a simple solution.
   // If we deal with insert/extract instructions, they all must have constant
   // indices, otherwise we should gather them, not try to vectorize.
+  // If alternate op node with 2 elements with gathered operands - do not
+  // vectorize.
+  auto &&NotProfitableForVectorization = [&S, this,
+                                          Depth](ArrayRef<Value *> VL) {
+    if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
+      return false;
+    if (VectorizableTree.size() < MinTreeSize)
+      return false;
+    if (Depth >= RecursionMaxDepth - 1)
+      return true;
+    // Check if all operands are extracts, part of vector node or can build a
+    // regular vectorize node.
+    SmallVector<unsigned> InstsCount(VL.size(), 0);
+    for (Value *V : VL) {
+      auto *I = cast<Instruction>(V);
+      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
+        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
+      }));
+    }
+    bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
+    if ((IsCommutative &&
+         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
+        (!IsCommutative &&
+         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
+      return true;
+    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
+    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
+    auto *I1 = cast<Instruction>(VL.front());
+    auto *I2 = cast<Instruction>(VL.back());
+    for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+                                             I2->getOperand(Op));
+    if (count_if(
+            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
+            }) >= S.MainOp->getNumOperands() / 2)
+      return false;
+    if (S.MainOp->getNumOperands() > 2)
+      return true;
+    if (IsCommutative) {
+      // Check permuted operands.
+      Candidates.clear();
+      for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
+        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+                                               I2->getOperand((Op + 1) % E));
+      if (any_of(
+              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
+              }))
+        return false;
+    }
+    return true;
+  };
   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode() ||
       (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
-       !all_of(VL, isVectorLikeInstWithConstOps))) {
-    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+       !all_of(VL, isVectorLikeInstWithConstOps)) ||
+      NotProfitableForVectorization(VL)) {
+    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
     if (TryToFindDuplicates(S))
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
                    ReuseShuffleIndicies);
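The change above threads a score floor through the root-pair search: a caller can now demand that the best candidate strictly beat a given score rather than merely not fail. A minimal standalone sketch of that contract (illustrative only: kScoreFail, kScoreSplat, and bestPairAbove are hypothetical stand-ins for LookAheadHeuristics::ScoreFail, LookAheadHeuristics::ScoreSplat, and findBestRootPair, with std::optional in place of llvm::Optional):

#include <cstddef>
#include <optional>
#include <vector>

// Hypothetical stand-ins for the LookAheadHeuristics score constants.
constexpr int kScoreFail = 0;
constexpr int kScoreSplat = 1;

// Return the index of the best-scoring candidate, but only if its score
// strictly beats the caller-supplied floor; otherwise return nullopt.
std::optional<std::size_t> bestPairAbove(const std::vector<int> &Scores,
                                         int Limit = kScoreFail) {
  int BestScore = Limit;
  std::optional<std::size_t> Index;
  for (std::size_t I = 0; I < Scores.size(); ++I) {
    if (Scores[I] > BestScore) {
      BestScore = Scores[I];
      Index = I;
    }
  }
  return Index;
}

With the default floor this matches the old behavior; NotProfitableForVectorization instead passes the splat score, so an operand pairing only counts when it looks better than splatting a single value.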
\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-4 | FileCheck %s --check-prefix=CHECK -; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 -slp-min-tree-size=6 | FileCheck %s --check-prefix=FORCE_REDUCTION +; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-6 -slp-min-tree-size=5 | FileCheck %s --check-prefix=FORCE_REDUCTION define void @Test(i32) { ; CHECK-LABEL: @Test( @@ -79,24 +79,22 @@ ; FORCE_REDUCTION-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15 ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP36:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP32:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] ; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> -; FORCE_REDUCTION-NEXT: [[TMP26:%.*]] = add <8 x i32> [[SHUFFLE]], -; FORCE_REDUCTION-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]]) -; FORCE_REDUCTION-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]]) -; FORCE_REDUCTION-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP26]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX13:%.*]] = and i32 [[TMP29]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_RDX14:%.*]] = and i32 [[OP_RDX13]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_RDX15:%.*]] = and i32 [[OP_RDX14]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_RDX16:%.*]] = and i32 [[OP_RDX15]], [[TMP27]] -; FORCE_REDUCTION-NEXT: [[OP_RDX17:%.*]] = and i32 [[OP_RDX16]], [[TMP28]] -; FORCE_REDUCTION-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX17]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP31]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP31]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP34:%.*]] = and <2 x i32> [[TMP30]], [[TMP33]] -; FORCE_REDUCTION-NEXT: [[TMP35:%.*]] = add <2 x i32> [[TMP30]], [[TMP33]] -; FORCE_REDUCTION-NEXT: [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP27:%.*]] = add <8 x i32> [[SHUFFLE]], +; FORCE_REDUCTION-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]]) +; FORCE_REDUCTION-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]]) +; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]] +; FORCE_REDUCTION-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]]) +; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]] +; FORCE_REDUCTION-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]] +; FORCE_REDUCTION-NEXT: 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -5,18 +5,18 @@
 ; CHECK-LABEL: @rftbsub(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = or i64 2, 1
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP2]], undef
 ; CHECK-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
 ; CHECK-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
-; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
-; CHECK-NEXT:    store double [[SUB25]], double* [[ARRAYIDX6]], align 8
-; CHECK-NEXT:    [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
-; CHECK-NEXT:    store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[SUB22]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
 ; CHECK-NEXT:    unreachable
 ;
 entry:
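The slp-throttle.ll pair, by contrast, is a plain fsub/fsub node; the bail-out only fires for alternate-opcode shuffles, so it leaves this group alone, and the load/fsub/store sequence now vectorizes to <2 x double> outright. One more piece worth sketching is the commutative retry, which also tries the swapped operand pairing before giving up (standalone and illustrative; GoodPair stands in for the findBestRootPair-vs-ScoreSplat test):

#include <array>

// For a 2-wide commutative pair, try the direct operand pairing first,
// then the permuted one, mirroring the (Op + 1) % E loop above: if any
// pairing beats a splat, the node may still be worth building.
template <typename Value, typename Pred>
bool anyGoodPairing(const std::array<Value, 2> &I1Ops,
                    const std::array<Value, 2> &I2Ops, Pred GoodPair) {
  for (int Op = 0; Op < 2; ++Op)
    if (GoodPair(I1Ops[Op], I2Ops[Op])) // direct: same operand index
      return true;
  for (int Op = 0; Op < 2; ++Op)
    if (GoodPair(I1Ops[Op], I2Ops[(Op + 1) % 2])) // swapped pairing
      return true;
  return false;
}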