diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6134,6 +6134,16 @@
           PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
           ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
               VecScalarsSz);
+      bool IsWholeSubvector =
+          OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
+      // IsWholeSubvector (above) reads OffsetBeg before it may be realigned.
+      // If a subvector of InsertVecSz elements does not fit at OffsetBeg, just
+      // generate a whole-sized vector and shuffle it with the source vector.
+      if (OffsetBeg + InsertVecSz > VecSz) {
+        // Align OffsetBeg down to generate the correct mask.
+        OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
+        InsertVecSz = VecSz;
+      }
 
       APInt DemandedElts = APInt::getZero(NumElts);
       // TODO: Add support for Instruction::InsertValue.
@@ -6177,7 +6187,7 @@
       // TODO: Implement the analysis of the FirstInsert->getOperand(0)
       // subvector of ActualVecTy.
       if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts &&
-          (Offset != OffsetBeg || (OffsetEnd + 1) % VecScalarsSz != 0)) {
+          !IsWholeSubvector) {
         if (InsertVecSz != VecSz) {
           auto *ActualVecTy =
               FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-vectorize.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-vectorize.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=aarch64 < %s | FileCheck %s
+
+define void @test(ptr %p) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INC:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[INC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i16> zeroinitializer, i16 [[TMP0]], i32 5
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP1]], i32 7
+; CHECK-NEXT:    ret void
+;
+entry:
+  %inc = getelementptr inbounds i16, ptr %p, i64 1
+  %0 = load i16, ptr %inc, align 4
+  %1 = load i16, ptr %p, align 2
+  %2 = insertelement <8 x i16> zeroinitializer, i16 %0, i32 5
+  %3 = insertelement <8 x i16> %2, i16 %1, i32 7
+  ret void
+}