Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -317,8 +317,10 @@
 /// TODO: Can we split off and reuse the shuffle mask detection from
 /// TargetTransformInfo::getInstructionThroughput?
 static Optional<TargetTransformInfo::ShuffleKind>
-isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
+isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
   auto *EI0 = cast<ExtractElementInst>(VL[0]);
+  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
+    return None;
   unsigned Size =
       cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
   Value *Vec1 = nullptr;
@@ -4320,7 +4322,7 @@
         // shuffle of a single/two vectors the scalars are extracted from.
         SmallVector<int> Mask;
         Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
-            isShuffle(VL, Mask);
+            isFixedVectorShuffle(VL, Mask);
         if (ShuffleKind.hasValue()) {
           // Found the bunch of extractelement instructions that must be gathered
           // into a vector and can be represented as a permutation elements in a
@@ -4892,7 +4894,7 @@
         VectorizableTree[0]->Scalars.size()) ||
        (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
         VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
-        isShuffle(VectorizableTree[1]->Scalars, Mask))))
+        isFixedVectorShuffle(VectorizableTree[1]->Scalars, Mask))))
     return true;
 
   // Gathering cost would be too much for tiny trees.
@@ -8912,7 +8914,7 @@
   if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
       (llvm::all_of(BuildVectorOpds,
                     [](Value *V) { return isa<ExtractElementInst>(V); }) &&
-       isShuffle(BuildVectorOpds, Mask)))
+       isFixedVectorShuffle(BuildVectorOpds, Mask)))
     return false;
 
   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
Index: llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll
+++ llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll
@@ -138,5 +138,62 @@
   ret <vscale x 4 x i32> %ins4
 }
 
+define void @sext_scalable_extractelement() {
+; CHECK-LABEL: @sext_scalable_extractelement(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <vscale x 2 x i32> undef, i32 undef
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[X0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* undef, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <vscale x 2 x i32> undef, i32 undef
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* undef, i64 [[TMP4]]
+; CHECK-NEXT:    ret void
+;
+  %x0 = extractelement <vscale x 2 x i32> undef, i32 undef
+  %1 = sext i32 %x0 to i64
+  %2 = getelementptr inbounds i64, i64* undef, i64 %1
+  %3 = extractelement <vscale x 2 x i32> undef, i32 undef
+  %4 = sext i32 %3 to i64
+  %5 = getelementptr inbounds i64, i64* undef, i64 %4
+  ret void
+}
+
+define void @zext_scalable_extractelement() {
+; CHECK-LABEL: @zext_scalable_extractelement(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <vscale x 2 x i32> undef, i32 undef
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[X0]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* undef, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <vscale x 2 x i32> undef, i32 undef
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* undef, i64 [[TMP4]]
+; CHECK-NEXT:    ret void
+;
+  %x0 = extractelement <vscale x 2 x i32> undef, i32 undef
+  %1 = zext i32 %x0 to i64
+  %2 = getelementptr inbounds i64, i64* undef, i64 %1
+  %3 = extractelement <vscale x 2 x i32> undef, i32 undef
+  %4 = zext i32 %3 to i64
+  %5 = getelementptr inbounds i64, i64* undef, i64 %4
+  ret void
+}
+
+define void @trunc_scalable_extractelement() {
+; CHECK-LABEL: @trunc_scalable_extractelement(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <vscale x 2 x i64> undef, i32 undef
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[X0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* undef, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <vscale x 2 x i64> undef, i32 undef
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* undef, i32 [[TMP4]]
+; CHECK-NEXT:    ret void
+;
+  %x0 = extractelement <vscale x 2 x i64> undef, i32 undef
+  %1 = trunc i64 %x0 to i32
+  %2 = getelementptr inbounds i32, i32* undef, i32 %1
+  %3 = extractelement <vscale x 2 x i64> undef, i32 undef
+  %4 = trunc i64 %3 to i32
+  %5 = getelementptr inbounds i32, i32* undef, i32 %4
+  ret void
+}
+
 declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>*, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>)
 declare void @llvm.masked.store.nxv16i8.p0nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32 immarg, <vscale x 16 x i1>)