diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4141,12 +4141,16 @@
   // Handle splat and all-constants stores. Also try to vectorize tiny trees
   // with the second gather nodes if they have less scalar operands rather than
   // the initial tree element (may be profitable to shuffle the second gather).
+  SmallVector<int> Mask;
   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
       (allConstant(VectorizableTree[1]->Scalars) ||
        isSplat(VectorizableTree[1]->Scalars) ||
        (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
         VectorizableTree[1]->Scalars.size() <
-            VectorizableTree[0]->Scalars.size())))
+            VectorizableTree[0]->Scalars.size()) ||
+       (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+        VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
+        isShuffle(VectorizableTree[1]->Scalars, Mask))))
     return true;

   // Gathering cost would be too much for tiny trees.

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -17,12 +17,18 @@
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
-; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]]
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
 ; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
-; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[E1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E2]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP5]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4
 ; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
 ; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
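For context (not part of the patch itself): the new clause lets the tiny-tree heuristic accept a two-node tree whose second node gathers extractelement instructions that all read from one fixed-width source vector, so the gather can be lowered as a single shuffle; isShuffle fills Mask with the extraction lanes. The following is a minimal standalone C++ sketch of that mask-building idea, modeling an extractelement as a (source vector, lane) pair. It is an illustration only, not LLVM's actual isShuffle, which classifies the shuffle kind and handles more cases (e.g. two-source shuffles).

// Standalone model of the "gather of extractelements forms a shuffle"
// test -- illustration only, NOT LLVM code.
#include <cstdio>
#include <optional>
#include <vector>

struct Extract {
  int SourceVec; // which fixed vector the element is read from
  int Lane;      // constant extraction index
};

// Returns the shuffle mask if every scalar extracts from the same source
// vector (a single-source shuffle); std::nullopt otherwise.
std::optional<std::vector<int>>
formsSingleSourceShuffle(const std::vector<Extract> &Scalars) {
  if (Scalars.empty())
    return std::nullopt;
  std::vector<int> Mask;
  for (const Extract &E : Scalars) {
    if (E.SourceVec != Scalars.front().SourceVec)
      return std::nullopt; // mixes source vectors: not one shuffle
    Mask.push_back(E.Lane);
  }
  return Mask;
}

int main() {
  // Mirrors the updated test: lanes 1 and 2 of one <4 x i32> source.
  std::vector<Extract> Scalars = {{0, 1}, {0, 2}};
  if (auto Mask = formsSingleSourceShuffle(Scalars))
    for (int M : *Mask)
      std::printf("mask elt %d\n", M); // prints 1, then 2
}

Run against the pattern from ext-trunc.ll, the sketch yields the mask {1, 2}: the shape the new condition recognizes before declaring the tiny tree vectorizable, which is why the E1/E2 extracts in the test are now rebuilt into a <2 x i32> vector instead of staying scalar.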