diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -164,6 +164,15 @@ "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores")); +// The maximum depth that the look-ahead score heuristic will explore +// when it probing among candidates for vectorization tree roots. +// The higher this value, the higher the compilation time overhead but unlike +// similar limit for operands ordering this is less frequently used, hence +// impact of higher value is less noticeable. +static cl::opt RootLookAheadMaxDepth( + "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for searching best rooting option")); + static cl::opt ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -2000,6 +2009,29 @@ #endif }; + /// Evaluate each pair in \p Candidates and return index into \p Candidates + /// for a pair which have highest score deemed to have best chance to form + /// root of profitable tree to vectorize. Return None if no candidate scored + /// above the LookAheadHeuristics::ScoreFail. + Optional + findBestRootPair(ArrayRef> Candidates) { + LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2, + RootLookAheadMaxDepth); + int BestScore = LookAheadHeuristics::ScoreFail; + Optional Index = None; + for (int I : seq(0, Candidates.size())) { + int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first, + Candidates[I].second, + /*U1=*/nullptr, /*U2=*/nullptr, + /*Level=*/1, None); + if (Score > BestScore) { + BestScore = Score; + Index = I; + } + } + return Index; + } + /// Checks if the instruction is marked for deletion. bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } @@ -9150,7 +9182,8 @@ if (!I) return false; - if (!isa(I) && !isa(I)) + if ((!isa(I) && !isa(I)) || + isa(I->getType())) return false; Value *P = I->getParent(); @@ -9161,32 +9194,40 @@ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) return false; - // Try to vectorize V. - if (tryToVectorizePair(Op0, Op1, R)) - return true; + // First collect all possible candidates + SmallVector, 4> Candidates; + Candidates.emplace_back(Op0, Op1); auto *A = dyn_cast(Op0); auto *B = dyn_cast(Op1); // Try to skip B. - if (B && B->hasOneUse()) { + if (A && B && B->hasOneUse()) { auto *B0 = dyn_cast(B->getOperand(0)); auto *B1 = dyn_cast(B->getOperand(1)); - if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) - return true; - if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) - return true; + if (B0 && B0->getParent() == P) + Candidates.emplace_back(A, B0); + if (B1 && B1->getParent() == P) + Candidates.emplace_back(A, B1); } - // Try to skip A. - if (A && A->hasOneUse()) { + if (B && A && A->hasOneUse()) { auto *A0 = dyn_cast(A->getOperand(0)); auto *A1 = dyn_cast(A->getOperand(1)); - if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) - return true; - if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) - return true; + if (A0 && A0->getParent() == P) + Candidates.emplace_back(A0, B); + if (A1 && A1->getParent() == P) + Candidates.emplace_back(A1, B); } - return false; + + if (Candidates.size() == 1) + return tryToVectorizePair(Op0, Op1, R); + + // We have multiple options. Try to pick the single best. + Optional BestCandidate = R.findBestRootPair(Candidates); + if (!BestCandidate) + return false; + return tryToVectorizePair(Candidates[*BestCandidate].first, + Candidates[*BestCandidate].second, R); } namespace { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/invalid_type.ll @@ -1,12 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -slp-vectorizer -S -pass-remarks-missed=slp-vectorizer 2>&1 | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" ; This test check that slp vectorizer is not trying to vectorize instructions already vectorized. -; CHECK: remark: :0:0: Cannot SLP vectorize list: type <16 x i8> is unsupported by vectorizer define void @vector() { +; CHECK-LABEL: @vector( +; CHECK-NEXT: [[LOAD0:%.*]] = tail call <16 x i8> @vector.load(<16 x i8>* undef, i32 1) +; CHECK-NEXT: [[LOAD1:%.*]] = tail call <16 x i8> @vector.load(<16 x i8>* undef, i32 2) +; CHECK-NEXT: [[ADD:%.*]] = add <16 x i8> [[LOAD1]], [[LOAD0]] +; CHECK-NEXT: tail call void @vector.store(<16 x i8> [[ADD]], <16 x i8>* undef, i32 1) +; CHECK-NEXT: ret void +; %load0 = tail call <16 x i8> @vector.load(<16 x i8> *undef, i32 1) %load1 = tail call <16 x i8> @vector.load(<16 x i8> *undef, i32 2) %add = add <16 x i8> %load1, %load0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll @@ -157,8 +157,6 @@ ; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 ; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] -; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0 -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]]) ; MINTREESIZE-NEXT: ret <4 x float> undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -192,8 +192,6 @@ ; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0 ; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] -; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0 -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1 ; MINTREESIZE-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]] ; MINTREESIZE-NEXT: call void @llvm.assume(i1 [[QI]]) ; MINTREESIZE-NEXT: ret <4 x float> undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll @@ -17,27 +17,18 @@ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv fast <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 -; CHECK-NEXT: [[I11:%.*]] = fmul fast double [[TMP4]], undef +; CHECK-NEXT: [[I09:%.*]] = fmul fast double [[TMP4]], undef +; CHECK-NEXT: [[I10:%.*]] = fsub fast double undef, [[I09]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x double> undef, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> undef, [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> , double [[I11]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x double> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x double> [[TMP12]], -; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <2 x double> [[TMP12]], -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = fdiv fast <2 x double> [[TMP15]], -; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <2 x double> [[TMP15]], -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP17]], <2 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = fadd fast <2 x double> [[TMP18]], -; CHECK-NEXT: [[TMP20:%.*]] = fdiv fast <2 x double> [[TMP18]], -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP20]], <2 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[TMP21]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[TMP21]], i32 1 -; CHECK-NEXT: [[I16:%.*]] = fadd fast double [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[I10]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = fdiv fast <2 x double> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 +; CHECK-NEXT: [[I07:%.*]] = fadd fast double undef, [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP10]], i32 1 +; CHECK-NEXT: [[I16:%.*]] = fadd fast double [[I07]], [[TMP12]] ; CHECK-NEXT: [[I17:%.*]] = fadd fast double [[I16]], [[C:%.*]] ; CHECK-NEXT: [[I18:%.*]] = fadd fast double [[I17]], [[D:%.*]] ; CHECK-NEXT: ret double [[I18]]