diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5535,10 +5535,6 @@ for (auto *V : VL) { ++Idx; - // Need to exclude undefs from analysis. - if (isa(V) || Mask[Idx] == UndefMaskElem) - continue; - // Reached the start of a new vector registers. if (Idx % EltsPerVector == 0) { RegMask.assign(EltsPerVector, UndefMaskElem); @@ -5546,6 +5542,10 @@ continue; } + // Need to exclude undefs from analysis. + if (isa(V) || Mask[Idx] == UndefMaskElem) + continue; + // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast(V)); @@ -5990,23 +5990,35 @@ assert(E->ReuseShuffleIndices.empty() && "Unique insertelements only are expected."); auto *SrcVecTy = cast(VL0->getType()); - unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); + + unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); + + unsigned OffsetBeg = *getInsertIndex(VL.front()); + unsigned OffsetEnd = *getInsertIndex(VL.back()); + unsigned VecSz = NumElts; + unsigned VecScalarsSz = NumScalars; + if (NumOfParts > 0) { + VecScalarsSz = NumElts / NumOfParts; + VecSz = PowerOf2Ceil( + (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * + VecScalarsSz); + } + APInt DemandedElts = APInt::getZero(NumElts); // TODO: Add support for Instruction::InsertValue. SmallVector Mask; if (!E->ReorderIndices.empty()) { inversePermutation(E->ReorderIndices, Mask); - Mask.append(NumElts - NumScalars, UndefMaskElem); } else { - Mask.assign(NumElts, UndefMaskElem); + Mask.assign(NumScalars, UndefMaskElem); std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); } - unsigned Offset = *getInsertIndex(VL0); bool IsIdentity = true; - SmallVector PrevMask(NumElts, UndefMaskElem); + SmallVector PrevMask(VecSz, UndefMaskElem); Mask.swap(PrevMask); + unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); for (unsigned I = 0; I < NumScalars; ++I) { unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); DemandedElts.setBit(InsertIdx); @@ -6019,31 +6031,28 @@ Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, /*Insert*/ true, /*Extract*/ false); - if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) { - // FIXME: Replace with SK_InsertSubvector once it is properly supported. - unsigned Sz = PowerOf2Ceil(Offset + NumScalars); + // First cost - resize to actual vector size if not identity shuffle or + // need to shift the vector. + // Do not calculate the cost if the actual size is the register size and + // we can merge this shuffle with the following SK_Select. + auto *ActualVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + if ((!IsIdentity || Offset != OffsetBeg) && VecScalarsSz != VecSz) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + ActualVecTy, Mask); + auto *FirstInsert = cast(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast(V)->getOperand(0)); + })); + // Second cost - permutation with subvector, if some elements are from the + // initial vector or inserting a subvector. + // TODO: Implement the analysis of the FirstInsert->getOperand(0) + // subvector of ActualVecTy. + if (!isUndefVector(FirstInsert->getOperand(0)) && Offset != OffsetBeg) Cost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(SrcVecTy->getElementType(), Sz)); - } else if (!IsIdentity) { - auto *FirstInsert = - cast(*find_if(E->Scalars, [E](Value *V) { - return !is_contained(E->Scalars, - cast(V)->getOperand(0)); - })); - if (isUndefVector(FirstInsert->getOperand(0))) { - Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); - } else { - SmallVector InsertMask(NumElts); - std::iota(InsertMask.begin(), InsertMask.end(), 0); - for (unsigned I = 0; I < NumElts; I++) { - if (Mask[I] != UndefMaskElem) - InsertMask[Offset + I] = NumElts + I; - } - Cost += - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); - } - } + TTI::SK_Select, + NumOfParts > 0 + ? FixedVectorType::get(SrcVecTy->getElementType(), VecScalarsSz) + : ActualVecTy); return Cost; } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -342,20 +342,44 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1 +; CHECK-NEXT: [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]] +; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2 +; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]] +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3 +; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]] ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]] +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[X]] to <4 x i16>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[Y]] to <4 x i16>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[Y]] to <4 x i16>* ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP10]]) -; CHECK-NEXT: ret i16 [[TMP11]] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP8]], i64 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP9]], i64 5 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP10]], i64 6 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP11]], i64 7 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP2]], i64 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP3]], i64 5 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP4]], i64 6 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP5]], i64 7 +; CHECK-NEXT: [[TMP22:%.*]] = mul <8 x i16> [[TMP16]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP22]]) +; CHECK-NEXT: ret i16 [[TMP23]] ; entry: %0 = load i16, i16* %x, align 2 @@ -420,41 +444,87 @@ ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 1 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 5 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 2 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 6 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 3 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 7 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>* +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 1 +; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 1 +; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 5 +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 5 +; CHECK-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 2 +; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 2 +; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 6 +; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 6 +; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 3 +; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 3 +; CHECK-NEXT: [[ARRAYIDX37_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 7 +; CHECK-NEXT: [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 7 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[P2]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX15]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX22]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX27]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[P1]] to <4 x i8>* ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>* -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP3]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = mul nuw nsw <16 x i32> [[TMP17]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP28]]) -; CHECK-NEXT: ret i32 [[TMP29]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX34]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX39]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ADD_PTR64]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX10_1]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX13_1]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[ARRAYIDX15_1]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX20_1]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX22_1]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[ARRAYIDX25_1]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = load i8, i8* [[ARRAYIDX27_1]], align 1 +; CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[ARRAYIDX34_1]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP0]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[TMP2]], i64 5 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP4]], i64 6 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP8]], i64 7 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP12]], i64 8 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP16]], i64 9 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP20]], i64 10 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i8> [[TMP33]], i8 [[TMP24]], i64 11 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP13]], i64 12 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP17]], i64 13 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP21]], i64 14 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i8> [[TMP37]], i8 [[TMP25]], i64 15 +; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = load i8, i8* [[ARRAYIDX37_1]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX39_1]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i8> [[TMP42]], i8 [[TMP1]], i64 4 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP43]], i8 [[TMP3]], i64 5 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP5]], i64 6 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x i8> [[TMP45]], i8 [[TMP11]], i64 7 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP14]], i64 8 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x i8> [[TMP47]], i8 [[TMP18]], i64 9 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x i8> [[TMP48]], i8 [[TMP22]], i64 10 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP49]], i8 [[TMP40]], i64 11 +; CHECK-NEXT: [[TMP51:%.*]] = insertelement <16 x i8> [[TMP50]], i8 [[TMP15]], i64 12 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP19]], i64 13 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <16 x i8> [[TMP52]], i8 [[TMP23]], i64 14 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <16 x i8> [[TMP53]], i8 [[TMP41]], i64 15 +; CHECK-NEXT: [[TMP55:%.*]] = zext <16 x i8> [[TMP54]] to <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = mul nuw nsw <16 x i32> [[TMP39]], [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP56]]) +; CHECK-NEXT: ret i32 [[TMP57]] ; entry: %idx.ext = sext i32 %off1 to i64 @@ -854,20 +924,45 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1 +; CHECK-NEXT: [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]] +; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2 +; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]] +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3 +; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]] ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]] +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[X]] to <4 x i16>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[Y]] to <4 x i16>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[Y]] to <4 x i16>* ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP10]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP8]], i64 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP9]], i64 5 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP10]], i64 6 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP11]], i64 7 +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP2]], i64 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP3]], i64 5 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP4]], i64 6 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP5]], i64 7 +; CHECK-NEXT: [[TMP22:%.*]] = mul <8 x i16> [[TMP16]], [[TMP21]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[TMP22]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP23]], align 2 ; CHECK-NEXT: ret void ; entry: @@ -1237,105 +1332,197 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[P1:%.*]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[P2:%.*]], align 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 5 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 5 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX20]], align 1 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX22]], align 1 +; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 6 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 6 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 3 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX32]], align 1 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 3 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX34]], align 1 +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 7 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 7 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ADD_PTR64]], align 1 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>* -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX10_1]], align 1 +; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 5 +; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 5 +; CHECK-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX20_1]], align 1 +; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 2 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX22_1]], align 1 +; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 6 +; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 6 +; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 3 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 3 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX34_1]], align 1 +; CHECK-NEXT: [[ARRAYIDX37_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 7 +; CHECK-NEXT: [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 7 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ADD_PTR64_1]], align 1 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1 +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 1 +; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[ARRAYIDX10_2]], align 1 +; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 5 +; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 5 +; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 2 +; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX20_2]], align 1 +; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX22_2]], align 1 +; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 6 +; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 6 +; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 3 +; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[ARRAYIDX32_2]], align 1 +; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 3 +; CHECK-NEXT: [[TMP23:%.*]] = load i8, i8* [[ARRAYIDX34_2]], align 1 +; CHECK-NEXT: [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 7 +; CHECK-NEXT: [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 7 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[ADD_PTR_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, <4 x i8>* [[TMP12]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[ADD_PTR64_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX3_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX3_3]] to <4 x i8>* -; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP17]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>* -; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, <4 x i8>* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX5_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX5_3]] to <4 x i8>* -; CHECK-NEXT: [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1 -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP37]], <4 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <16 x i8> [[TMP38]], <16 x i8> [[TMP39]], <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i8> [[TMP40]], <16 x i8> [[TMP41]], <16 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i8> [[TMP44]], <16 x i8> [[TMP45]], <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i8> [[TMP46]], <16 x i8> [[TMP47]], <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i8> [[TMP50]], <16 x i8> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i8> [[TMP52]], <16 x i8> [[TMP53]], <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[ADD_PTR_2]] to <4 x i8>* +; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8* [[ADD_PTR64_2]] to <4 x i8>* +; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, <4 x i8>* [[TMP26]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ARRAYIDX13]], align 1 +; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX25]], align 1 +; CHECK-NEXT: [[TMP31:%.*]] = load i8, i8* [[ARRAYIDX37]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = load i8, i8* [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, i8* [[ARRAYIDX13_1]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[ARRAYIDX25_1]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[ARRAYIDX37_1]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = load i8, i8* [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX13_2]], align 1 +; CHECK-NEXT: [[TMP38:%.*]] = load i8, i8* [[ARRAYIDX25_2]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = load i8, i8* [[ARRAYIDX37_2]], align 1 +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8* [[ARRAYIDX3_3]] to <4 x i8>* +; CHECK-NEXT: [[TMP41:%.*]] = load <4 x i8>, <4 x i8>* [[TMP40]], align 1 +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x i8> [[TMP41]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i8> [[TMP42]], i8 [[TMP36]], i64 4 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP43]], i8 [[TMP37]], i64 5 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP38]], i64 6 +; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x i8> [[TMP45]], i8 [[TMP39]], i64 7 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP32]], i64 8 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x i8> [[TMP47]], i8 [[TMP33]], i64 9 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x i8> [[TMP48]], i8 [[TMP34]], i64 10 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP49]], i8 [[TMP35]], i64 11 +; CHECK-NEXT: [[TMP51:%.*]] = insertelement <16 x i8> [[TMP50]], i8 [[TMP28]], i64 12 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP29]], i64 13 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <16 x i8> [[TMP52]], i8 [[TMP30]], i64 14 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <16 x i8> [[TMP53]], i8 [[TMP31]], i64 15 ; CHECK-NEXT: [[TMP55:%.*]] = zext <16 x i8> [[TMP54]] to <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP49]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP29]], [[TMP43]] -; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], -; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP56]] -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]] -; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP82]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP82]], 16 +; CHECK-NEXT: [[TMP56:%.*]] = load i8, i8* [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i8, i8* [[ARRAYIDX15]], align 1 +; CHECK-NEXT: [[TMP58:%.*]] = load i8, i8* [[ARRAYIDX27]], align 1 +; CHECK-NEXT: [[TMP59:%.*]] = load i8, i8* [[ARRAYIDX39]], align 1 +; CHECK-NEXT: [[TMP60:%.*]] = load i8, i8* [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP61:%.*]] = load i8, i8* [[ARRAYIDX15_1]], align 1 +; CHECK-NEXT: [[TMP62:%.*]] = load i8, i8* [[ARRAYIDX27_1]], align 1 +; CHECK-NEXT: [[TMP63:%.*]] = load i8, i8* [[ARRAYIDX39_1]], align 1 +; CHECK-NEXT: [[TMP64:%.*]] = load i8, i8* [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX15_2]], align 1 +; CHECK-NEXT: [[TMP66:%.*]] = load i8, i8* [[ARRAYIDX27_2]], align 1 +; CHECK-NEXT: [[TMP67:%.*]] = load i8, i8* [[ARRAYIDX39_2]], align 1 +; CHECK-NEXT: [[TMP68:%.*]] = bitcast i8* [[ARRAYIDX5_3]] to <4 x i8>* +; CHECK-NEXT: [[TMP69:%.*]] = load <4 x i8>, <4 x i8>* [[TMP68]], align 1 +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <4 x i8> [[TMP69]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x i8> [[TMP70]], i8 [[TMP64]], i64 4 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP65]], i64 5 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <16 x i8> [[TMP72]], i8 [[TMP66]], i64 6 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <16 x i8> [[TMP73]], i8 [[TMP67]], i64 7 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP74]], i8 [[TMP60]], i64 8 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <16 x i8> [[TMP75]], i8 [[TMP61]], i64 9 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP62]], i64 10 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <16 x i8> [[TMP77]], i8 [[TMP63]], i64 11 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <16 x i8> [[TMP78]], i8 [[TMP56]], i64 12 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP79]], i8 [[TMP57]], i64 13 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <16 x i8> [[TMP80]], i8 [[TMP58]], i64 14 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP59]], i64 15 +; CHECK-NEXT: [[TMP83:%.*]] = zext <16 x i8> [[TMP82]] to <16 x i32> +; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <4 x i8> [[TMP25]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP84]], i8 [[TMP16]], i64 4 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <16 x i8> [[TMP85]], i8 [[TMP18]], i64 5 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP20]], i64 6 +; CHECK-NEXT: [[TMP88:%.*]] = insertelement <16 x i8> [[TMP87]], i8 [[TMP22]], i64 7 +; CHECK-NEXT: [[TMP89:%.*]] = insertelement <16 x i8> [[TMP88]], i8 [[TMP8]], i64 8 +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP89]], i8 [[TMP10]], i64 9 +; CHECK-NEXT: [[TMP91:%.*]] = insertelement <16 x i8> [[TMP90]], i8 [[TMP12]], i64 10 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP14]], i64 11 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <16 x i8> [[TMP92]], i8 [[TMP0]], i64 12 +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <16 x i8> [[TMP93]], i8 [[TMP2]], i64 13 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP94]], i8 [[TMP4]], i64 14 +; CHECK-NEXT: [[TMP96:%.*]] = insertelement <16 x i8> [[TMP95]], i8 [[TMP6]], i64 15 +; CHECK-NEXT: [[TMP97:%.*]] = zext <16 x i8> [[TMP96]] to <16 x i32> +; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP99:%.*]] = insertelement <16 x i8> [[TMP98]], i8 [[TMP17]], i64 4 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <16 x i8> [[TMP99]], i8 [[TMP19]], i64 5 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP21]], i64 6 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> [[TMP101]], i8 [[TMP23]], i64 7 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP9]], i64 8 +; CHECK-NEXT: [[TMP104:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP11]], i64 9 +; CHECK-NEXT: [[TMP105:%.*]] = insertelement <16 x i8> [[TMP104]], i8 [[TMP13]], i64 10 +; CHECK-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP105]], i8 [[TMP15]], i64 11 +; CHECK-NEXT: [[TMP107:%.*]] = insertelement <16 x i8> [[TMP106]], i8 [[TMP1]], i64 12 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP3]], i64 13 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP108]], i8 [[TMP5]], i64 14 +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <16 x i8> [[TMP109]], i8 [[TMP7]], i64 15 +; CHECK-NEXT: [[TMP111:%.*]] = zext <16 x i8> [[TMP110]] to <16 x i32> +; CHECK-NEXT: [[TMP112:%.*]] = sub nsw <16 x i32> [[TMP97]], [[TMP111]] +; CHECK-NEXT: [[TMP113:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP83]] +; CHECK-NEXT: [[TMP114:%.*]] = shl nsw <16 x i32> [[TMP113]], +; CHECK-NEXT: [[TMP115:%.*]] = add nsw <16 x i32> [[TMP114]], [[TMP112]] +; CHECK-NEXT: [[TMP116:%.*]] = shufflevector <16 x i32> [[TMP115]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP117:%.*]] = shufflevector <16 x i32> [[TMP116]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP118:%.*]] = add nsw <16 x i32> [[TMP116]], [[TMP117]] +; CHECK-NEXT: [[TMP119:%.*]] = sub nsw <16 x i32> [[TMP116]], [[TMP117]] +; CHECK-NEXT: [[TMP120:%.*]] = shufflevector <16 x i32> [[TMP118]], <16 x i32> [[TMP119]], <16 x i32> +; CHECK-NEXT: [[TMP121:%.*]] = shufflevector <16 x i32> [[TMP120]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP122:%.*]] = add nsw <16 x i32> [[TMP120]], [[TMP121]] +; CHECK-NEXT: [[TMP123:%.*]] = sub nsw <16 x i32> [[TMP120]], [[TMP121]] +; CHECK-NEXT: [[TMP124:%.*]] = shufflevector <16 x i32> [[TMP122]], <16 x i32> [[TMP123]], <16 x i32> +; CHECK-NEXT: [[TMP125:%.*]] = shufflevector <16 x i32> [[TMP124]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP126:%.*]] = add nsw <16 x i32> [[TMP124]], [[TMP125]] +; CHECK-NEXT: [[TMP127:%.*]] = sub nsw <16 x i32> [[TMP124]], [[TMP125]] +; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <16 x i32> [[TMP126]], <16 x i32> [[TMP127]], <16 x i32> +; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <16 x i32> [[TMP128]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP130:%.*]] = add nsw <16 x i32> [[TMP128]], [[TMP129]] +; CHECK-NEXT: [[TMP131:%.*]] = sub nsw <16 x i32> [[TMP128]], [[TMP129]] +; CHECK-NEXT: [[TMP132:%.*]] = shufflevector <16 x i32> [[TMP130]], <16 x i32> [[TMP131]], <16 x i32> +; CHECK-NEXT: [[TMP133:%.*]] = lshr <16 x i32> [[TMP132]], +; CHECK-NEXT: [[TMP134:%.*]] = and <16 x i32> [[TMP133]], +; CHECK-NEXT: [[TMP135:%.*]] = mul nuw <16 x i32> [[TMP134]], +; CHECK-NEXT: [[TMP136:%.*]] = add <16 x i32> [[TMP135]], [[TMP132]] +; CHECK-NEXT: [[TMP137:%.*]] = xor <16 x i32> [[TMP136]], [[TMP135]] +; CHECK-NEXT: [[TMP138:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP137]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP138]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP138]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -230,21 +230,25 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 +; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP8:%.*]] = shl <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> -; SSE-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R52]], <8 x i32> [[TMP12]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] +; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> +; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -230,21 +230,25 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 +; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP8:%.*]] = shl <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> -; SSE-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R52]], <8 x i32> [[TMP12]], <8 x i32> -; SSE-NEXT: ret <8 x i32> [[R71]] +; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] +; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> +; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7 +; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_lshr_shl_v8i32( ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32>