Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -921,6 +921,9 @@
   /// ExtractElement, ExtractValue), which can be part of the graph.
   Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
 
+  /// Sort loads into increasing pointer offsets to allow greater clustering.
+  Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
+
   /// Gets reordering data for the given tree entry. If the entry is vectorized
   /// - just return ReorderIndices, otherwise check if the scalars can be
   /// reordered and return the most optimal order.
@@ -3335,6 +3338,93 @@
   return None;
 }
 
+static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
+                                   const DataLayout &DL, ScalarEvolution &SE,
+                                   SmallVectorImpl<unsigned> &SortedIndices) {
+  assert(llvm::all_of(
+             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
+         "Expected list of pointer operands.");
+  // Map from bases to a vector of (Ptr, Offset, OrigIdx) tuples. Each Ptr is
+  // inserted into the bucket of a base it is reachable from; the buckets are
+  // later sorted by offset so that values adjacent in memory end up next to
+  // one another in the returned index order.
+  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
+  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
+
+  unsigned Cnt = 1;
+  for (Value *Ptr : VL.drop_front()) {
+    bool Found = any_of(Bases, [&](auto &Base) {
+      Optional<int> Diff =
+          getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
+                          /*StrictCheck=*/true);
+      if (!Diff)
+        return false;
+
+      Base.second.emplace_back(Ptr, *Diff, Cnt++);
+      return true;
+    });
+
+    if (!Found) {
+      // If we haven't found enough to usefully cluster, return early.
+      if (Bases.size() > VL.size() / 2 - 1)
+        return false;
+
+      // Not found already - add a new Base.
+      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
+    }
+  }
+
+  // For each of the bases, sort the pointers by Offset and check whether any
+  // of them become consecutively allocated.
+  bool AnyConsecutive = false;
+  for (auto &Base : Bases) {
+    auto &Vec = Base.second;
+    if (Vec.size() > 1) {
+      llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
+                                const std::tuple<Value *, int, unsigned> &Y) {
+        return std::get<1>(X) < std::get<1>(Y);
+      });
+      int InitialOffset = std::get<1>(Vec[0]);
+      AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
+        return std::get<1>(P.value()) == P.index() + InitialOffset;
+      });
+    }
+  }
+
+  // Fill the SortedIndices array only if it looks worthwhile to sort the
+  // pointers.
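+  // Illustrative example (hypothetical pointers, not taken from the tests
+  // below): for VL = {A, A+2, A+1, B, B+1} the buckets are
+  // A -> [(A,0,0), (A+2,2,1), (A+1,1,2)] and B -> [(B,0,3), (B+1,1,4)].
+  // Both buckets become consecutive after the sort above, so the order
+  // returned below is SortedIndices = {0, 2, 1, 3, 4}.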
+  SortedIndices.clear();
+  if (!AnyConsecutive)
+    return false;
+
+  for (auto &Base : Bases) {
+    for (auto &T : Base.second)
+      SortedIndices.push_back(std::get<2>(T));
+  }
+
+  assert(SortedIndices.size() == VL.size() &&
+         "Expected SortedIndices to be the size of VL");
+  return true;
+}
+
+Optional<BoUpSLP::OrdersType>
+BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
+  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
+  Type *ScalarTy = TE.Scalars[0]->getType();
+
+  SmallVector<Value *> Ptrs;
+  Ptrs.reserve(TE.Scalars.size());
+  for (Value *V : TE.Scalars) {
+    auto *L = dyn_cast<LoadInst>(V);
+    if (!L || !L->isSimple())
+      return None;
+    Ptrs.push_back(L->getPointerOperand());
+  }
+
+  BoUpSLP::OrdersType Order;
+  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
+    return Order;
+  return None;
+}
+
 Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
                                                          bool TopToBottom) {
   // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -3375,6 +3465,9 @@
     }
     if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
       return CurrentOrder;
+    if (TE.Scalars.size() >= 4)
+      if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
+        return Order;
   }
   return None;
 }
Index: llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -342,44 +342,20 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1
-; CHECK-NEXT: [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]]
-; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2
-; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]]
-; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3
-; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]]
 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]]
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]]
-; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]]
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[X]] to <4 x i16>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>*
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2
-; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2
-; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2
-; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP9]], i64 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP8]], i64 5
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP11]], i64 6
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP10]], i64 7
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP3]], i64 4
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP2]], i64 5
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP5]], i64 6
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP4]], i64 7
-; CHECK-NEXT: [[TMP22:%.*]] = mul <8 x i16> [[TMP16]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP22]])
-; CHECK-NEXT: ret i16 [[TMP23]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP10]])
+; CHECK-NEXT: ret i16 [[TMP11]]
 ;
 entry:
   %0 = load i16, i16* %x, align 2
@@ -444,73 +420,41 @@
 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4
 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 1
-; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 5
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 2
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 6
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 3
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 7
 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4
 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4
-; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 1
-; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 5
-; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 2
-; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 6
-; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 3
-; CHECK-NEXT: [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 7
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[P2]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX5]], align 1
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX10]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX15]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX22]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX27]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[P1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>*
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX34]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>*
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX39]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ADD_PTR64]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX10_1]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX15_1]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ARRAYIDX22_1]], align 1
-; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX27_1]], align 1
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>*
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>*
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>*
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>*
 ; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1
-; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX34_1]], align 1
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP8]], i64 4
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP4]], i64 5
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP2]], i64 6
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP0]], i64 7
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i8> [[TMP25]], <16 x i8> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[TMP20]], i64 12
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP16]], i64 13
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP14]], i64 14
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP12]], i64 15
-; CHECK-NEXT: [[TMP32:%.*]] = zext <16 x i8> [[TMP31]] to <16 x i32>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>*
-; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, <4 x i8>* [[TMP33]], align 1
-; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[ARRAYIDX39_1]], align 1
-; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP11]], i64 4
-; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i8> [[TMP37]], i8 [[TMP5]], i64 5
-; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP3]], i64 6
-; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP39]], i8 [[TMP1]], i64 7
-; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i8> [[TMP40]], <16 x i8> [[TMP41]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i8> [[TMP42]], i8 [[TMP35]], i64 12
-; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP43]], i8 [[TMP17]], i64 13
-; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP15]], i64 14
-; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x i8> [[TMP45]], i8 [[TMP13]], i64 15
-; CHECK-NEXT: [[TMP47:%.*]] = zext <16 x i8> [[TMP46]] to <16 x i32>
-; CHECK-NEXT: [[TMP48:%.*]] = mul nuw nsw <16 x i32> [[TMP32]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP48]])
-; CHECK-NEXT: ret i32 [[TMP49]]
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
+; CHECK-NEXT: [[TMP28:%.*]] = mul nuw nsw <16 x i32> [[TMP17]], [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP28]])
+; CHECK-NEXT: ret i32 [[TMP29]]
 ;
 entry:
   %idx.ext = sext i32 %off1 to i64
@@ -750,9 +694,6 @@
 ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1
 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64
 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 [[STRIDE]], 2
-; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[ADD7]] to i64
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM8]]
 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
 ; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM11]]
@@ -767,64 +708,51 @@
 ; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1
 ; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64
 ; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM27]]
-; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[MUL21]], 2
-; CHECK-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD30]] to i64
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM31]]
 ; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i64 2
 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX35]], align 4
 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM8]]
 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM11]]
 ; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM19]]
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX56]], align 4
 ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM23]]
 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX60]], align 4
 ; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM27]]
-; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM31]]
 ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, i32* [[Z:%.*]], i64 1
 ; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
 ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 6
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[X]] to <2 x i32>*
 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX6]] to <2 x i32>*
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[Y]] to <2 x i32>*
 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP13]], i64 2
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i64 3
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP9]], i64 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP8]], i64 3
-; CHECK-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX72]] to <4 x i32>*
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX41]] to <2 x i32>*
+; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <2 x i32> [[TMP11]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP13]], [[TMP9]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[ARRAYIDX72]] to <4 x i32>*
 ; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 7
 ; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
 ; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 11
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX12]] to <2 x i32>*
-; CHECK-NEXT: [[TMP23:%.*]] = load <2 x i32>, <2 x i32>* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX28]], align 4
-; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX32]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARRAYIDX48]] to <2 x i32>*
-; CHECK-NEXT: [[TMP27:%.*]] = load <2 x i32>, <2 x i32>* [[TMP26]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX64]], align 4
-; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX68]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX12]] to <2 x i32>*
+; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[ARRAYIDX28]] to <2 x i32>*
+; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i32>, <2 x i32>* [[TMP19]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX48]] to <2 x i32>*
+; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX64]] to <2 x i32>*
+; CHECK-NEXT: [[TMP24:%.*]] = load <2 x i32>, <2 x i32>* [[TMP23]], align 4
 ; CHECK-NEXT: store i32 [[MUL73]], i32* [[Z]], align 4
-; CHECK-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP21]], align 4
+; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP16]], align 4
 ; CHECK-NEXT: store i32 [[MUL85]], i32* [[ARRAYIDX76]], align 4
 ; CHECK-NEXT: store i32 [[MUL87]], i32* [[ARRAYIDX88]], align 4
-; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP29]], i64 2
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP28]], i64 3
-; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <2 x i32> [[TMP23]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP25]], i64 2
-; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP24]], i64 3
-; CHECK-NEXT: [[TMP36:%.*]] = mul nsw <4 x i32> [[TMP32]], [[TMP35]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX84]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP36]], <4 x i32>* [[TMP37]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = mul nsw <2 x i32> [[TMP22]], [[TMP18]]
+; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <2 x i32> [[TMP24]], [[TMP20]]
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> [[TMP26]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[ARRAYIDX84]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP27]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -926,44 +854,20 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1
-; CHECK-NEXT: [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]]
-; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2
-; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]]
-; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3
-; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]]
 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]]
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]]
-; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]]
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[X]] to <4 x i16>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>*
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2
-; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2
-; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2
-; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP9]], i64 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP8]], i64 5
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP11]], i64 6
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP10]], i64 7
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP3]], i64 4
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP2]], i64 5
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP5]], i64 6
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP4]], i64 7
-; CHECK-NEXT: [[TMP22:%.*]] = mul <8 x i16> [[TMP16]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>*
-; CHECK-NEXT: store <8 x i16> [[TMP22]], <8 x i16>* [[TMP23]], align 2
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>*
+; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP10]], align 2
 ; CHECK-NEXT: ret void
 ;
 entry:
Index: llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
@@ -110,12 +110,12 @@
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G20]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP4]], <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
-; CHECK-NEXT: store <8 x i32> [[TMP7]], <8 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
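
A note on the CHECK lines above: these tests use the llvm/utils/update_test_checks.py format, so (assuming a locally built `opt`; the `build/bin` path below is only an example, adjust for your build tree) they can be regenerated with:

  llvm/utils/update_test_checks.py --opt-binary build/bin/opt \
      llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll \
      llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll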