Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -873,6 +873,9 @@
   /// ExtractElement, ExtractValue), which can be part of the graph.
   Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
 
+  /// Sort loads into increasing pointer offsets to allow greater clustering.
+  Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
+
   /// Gets reordering data for the given tree entry. If the entry is vectorized
   /// - just return ReorderIndices, otherwise check if the scalars can be
   /// reordered and return the most optimal order.
@@ -3238,6 +3241,90 @@
   return None;
 }
 
+bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
+                            const DataLayout &DL, ScalarEvolution &SE,
+                            SmallVectorImpl<unsigned> &SortedIndices) {
+  assert(llvm::all_of(
+             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
+         "Expected list of pointer operands.");
+  // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
+  // Ptr into, sort and return the sorted indices with values next to one
+  // another.
+  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
+  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
+
+  unsigned Cnt = 1;
+  for (auto *Ptr : VL.drop_front()) {
+    bool Found = false;
+    for (auto &Base : Bases) {
+      Optional<int> Diff =
+          getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
+                          /*StrictCheck=*/true);
+      if (!Diff)
+        continue;
+
+      Base.second.push_back(std::make_tuple(Ptr, *Diff, Cnt++));
+      Found = true;
+      break;
+    }
+
+    if (!Found) {
+      // If we haven't found enough to usefully cluster, return early.
+      if (Bases.size() > VL.size() / 2 - 1)
+        return false;
+
+      // Not found already - add a new Base
+      Bases[Ptr].push_back(std::make_tuple(Ptr, 0, Cnt++));
+    }
+  }
+
+  bool AnyConsecutive = false;
+  for (auto &Base : Bases) {
+    auto &Vec = Base.second;
+    if (Vec.size() > 1) {
+      llvm::sort(Vec, [](std::tuple<Value *, int, unsigned> &X,
+                         std::tuple<Value *, int, unsigned> &Y) {
+        return std::get<1>(X) < std::get<1>(Y);
+      });
+      int Start = std::get<1>(Vec.front());
+      int End = std::get<1>(Vec.back());
+      AnyConsecutive |= (End - Start) == int(Vec.size() - 1);
+    }
+  }
+
+  // Fill SortedIndices array only if it looks worthwhile to sort the ptrs.
+  SortedIndices.clear();
+  if (!AnyConsecutive)
+    return false;
+
+  for (auto &Base : Bases) {
+    for (auto &T : Base.second)
+      SortedIndices.push_back(std::get<2>(T));
+  }
+
+  assert(SortedIndices.size() == VL.size());
+  return true;
+}
+
+Optional<BoUpSLP::OrdersType>
+BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
+  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
+  Type *ScalarTy = TE.Scalars[0]->getType();
+
+  SmallVector<Value *> Ptrs;
+  for (Value *V : TE.Scalars) {
+    auto *L = dyn_cast<LoadInst>(V);
+    if (!L || !L->isSimple())
+      return None;
+    Ptrs.push_back(L->getPointerOperand());
+  }
+
+  BoUpSLP::OrdersType Order;
+  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
+    return Order;
+  return None;
+}
+
 Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
                                                          bool TopToBottom) {
   // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -3278,6 +3365,8 @@
     }
     if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
      return CurrentOrder;
+    if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
+      return Order;
   }
   return None;
 }
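The heart of the change is clusterSortPtrAccesses: each pointer is greedily
attached to the first base it has a computable constant distance from (via
getPointersDiff), each cluster is then sorted by increasing offset, and the
original indices are emitted cluster by cluster. The sketch below shows the
same idea over plain (base, offset) pairs instead of SCEV-analyzed pointers;
Access, BaseId, and clusterSort are illustrative names, and an ordered
std::map stands in for the insertion-ordered MapVector used above.

    #include <algorithm>
    #include <map>
    #include <utility>
    #include <vector>

    // A pointer access reduced to (BaseId, Offset). BaseId stands in for the
    // underlying object that getPointersDiff would recognize through SCEV.
    struct Access {
      int BaseId;
      int Offset; // element offset from the base
    };

    // Group accesses by base, sort each group by offset, and emit the
    // original indices in the new order. Fails unless at least one group
    // forms a consecutive run, mirroring the AnyConsecutive check.
    bool clusterSort(const std::vector<Access> &VL,
                     std::vector<unsigned> &SortedIndices) {
      // Base id -> (offset, original index) pairs for that base.
      std::map<int, std::vector<std::pair<int, unsigned>>> Bases;
      for (unsigned I = 0; I < VL.size(); ++I)
        Bases[VL[I].BaseId].push_back({VL[I].Offset, I});

      bool AnyConsecutive = false;
      for (auto &Entry : Bases) {
        auto &Vec = Entry.second;
        std::sort(Vec.begin(), Vec.end());
        // Offsets {o, o+1, ..., o+n-1} span exactly n-1 once sorted.
        AnyConsecutive |= Vec.size() > 1 &&
                          Vec.back().first - Vec.front().first ==
                              int(Vec.size()) - 1;
      }
      if (!AnyConsecutive)
        return false;

      SortedIndices.clear();
      for (auto &Entry : Bases)
        for (auto &OffIdx : Entry.second)
          SortedIndices.push_back(OffIdx.second);
      return true;
    }

Two differences in the real function are worth keeping in mind: it gives up
early when the pointers mostly fail to cluster (more bases than about half
the list), and it keeps bases in discovery order, so the resulting
permutation stays stable with respect to the input.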
Index: llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -344,42 +344,18 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; CHECK-NEXT:    [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1
-; CHECK-NEXT:    [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2
-; CHECK-NEXT:    [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2
-; CHECK-NEXT:    [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3
-; CHECK-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[Y:%.*]] to <4 x i16>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[Y:%.*]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2
-; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2
-; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP9]], i64 4
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP8]], i64 5
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP11]], i64 6
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP10]], i64 7
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP3]], i64 4
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP2]], i64 5
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP5]], i64 6
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP4]], i64 7
-; CHECK-NEXT:    [[TMP22:%.*]] = mul <8 x i16> [[TMP16]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP22]])
-; CHECK-NEXT:    ret i16 [[TMP23]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP10]])
+; CHECK-NEXT:    ret i16 [[TMP11]]
 ;
 entry:
   %0 = load i16, i16* %x, align 2
@@ -442,75 +418,43 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[P2:%.*]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX10]], align 1
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 5
-; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX15]], align 1
-; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX22]], align 1
-; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 6
-; CHECK-NEXT:    [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX27]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[P1]] to <4 x i8>*
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>*
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 3
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX34]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 7
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX39]], align 1
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[ADD_PTR64]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX10_1]], align 1
-; CHECK-NEXT:    [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 5
-; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX15_1]], align 1
-; CHECK-NEXT:    [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 2
-; CHECK-NEXT:    [[TMP16:%.*]] = load i8, i8* [[ARRAYIDX22_1]], align 1
-; CHECK-NEXT:    [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 6
-; CHECK-NEXT:    [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX27_1]], align 1
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>*
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1
-; CHECK-NEXT:    [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 3
-; CHECK-NEXT:    [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX34_1]], align 1
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP8]], i64 4
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP4]], i64 5
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP2]], i64 6
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP0]], i64 7
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i8> [[TMP25]], <16 x i8> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[TMP20]], i64 12
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP16]], i64 13
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP14]], i64 14
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP12]], i64 15
-; CHECK-NEXT:    [[TMP32:%.*]] = zext <16 x i8> [[TMP31]] to <16 x i32>
-; CHECK-NEXT:    [[TMP33:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, <4 x i8>* [[TMP33]], align 1
-; CHECK-NEXT:    [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 7
-; CHECK-NEXT:    [[TMP35:%.*]] = load i8, i8* [[ARRAYIDX39_1]], align 1
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP11]], i64 4
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <16 x i8> [[TMP37]], i8 [[TMP5]], i64 5
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP3]], i64 6
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x i8> [[TMP39]], i8 [[TMP1]], i64 7
-; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i8> [[TMP40]], <16 x i8> [[TMP41]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <16 x i8> [[TMP42]], i8 [[TMP35]], i64 12
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <16 x i8> [[TMP43]], i8 [[TMP17]], i64 13
-; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP15]], i64 14
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <16 x i8> [[TMP45]], i8 [[TMP13]], i64 15
-; CHECK-NEXT:    [[TMP47:%.*]] = zext <16 x i8> [[TMP46]] to <16 x i32>
-; CHECK-NEXT:    [[TMP48:%.*]] = mul nuw nsw <16 x i32> [[TMP32]], [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP48]])
-; CHECK-NEXT:    ret i32 [[TMP49]]
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
+; CHECK-NEXT:    [[TMP28:%.*]] = mul nuw nsw <16 x i32> [[TMP17]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP28]])
+; CHECK-NEXT:    ret i32 [[TMP29]]
 ;
 entry:
   %idx.ext = sext i32 %off1 to i64
@@ -752,11 +696,8 @@
 ; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1
 ; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[ADD7:%.*]] = add nsw i32 [[STRIDE]], 2
-; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[ADD7]] to i64
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX6]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
 ; CHECK-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[IDXPROM11]]
@@ -780,9 +721,8 @@
 ; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX35]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM8]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX44]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX41]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, <2 x i32>* [[TMP14]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX48]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[IDXPROM15]]
@@ -797,27 +737,23 @@
 ; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP13]], [[TMP2]]
 ; CHECK-NEXT:    store i32 [[MUL73]], i32* [[Z]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 6
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP15]], i64 2
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP14]], i64 3
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP4]], i64 2
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP3]], i64 3
-; CHECK-NEXT:    [[TMP28:%.*]] = mul nsw <4 x i32> [[TMP24]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i32* [[ARRAYIDX72]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nsw <2 x i32> [[TMP12]], [[TMP1]]
+; CHECK-NEXT:    [[TMP23:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> [[TMP23]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX72]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP24]], align 4
 ; CHECK-NEXT:    [[MUL81:%.*]] = mul nsw i32 [[TMP16]], [[TMP5]]
 ; CHECK-NEXT:    [[ARRAYIDX82:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 8
 ; CHECK-NEXT:    store i32 [[MUL81]], i32* [[ARRAYIDX82]], align 4
-; CHECK-NEXT:    [[TMP30:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP7]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP30]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP31:%.*]] = bitcast i32* [[ARRAYIDX76]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[SHUFFLE]], <2 x i32>* [[TMP31]], align 4
-; CHECK-NEXT:    [[TMP32:%.*]] = mul nsw <2 x i32> [[TMP20]], [[TMP9]]
+; CHECK-NEXT:    [[TMP25:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP7]]
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast i32* [[ARRAYIDX76]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[SHUFFLE1]], <2 x i32>* [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = mul nsw <2 x i32> [[TMP20]], [[TMP9]]
 ; CHECK-NEXT:    [[ARRAYIDX90:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 10
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP32]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP33:%.*]] = bitcast i32* [[ARRAYIDX90]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[SHUFFLE1]], <2 x i32>* [[TMP33]], align 4
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP28:%.*]] = bitcast i32* [[ARRAYIDX90]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[SHUFFLE2]], <2 x i32>* [[TMP28]], align 4
 ; CHECK-NEXT:    [[MUL91:%.*]] = mul nsw i32 [[TMP21]], [[TMP10]]
 ; CHECK-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 9
 ; CHECK-NEXT:    store i32 [[MUL91]], i32* [[ARRAYIDX92]], align 4
@@ -924,42 +860,18 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; CHECK-NEXT:    [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1
-; CHECK-NEXT:    [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2
-; CHECK-NEXT:    [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2
-; CHECK-NEXT:    [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3
-; CHECK-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[Y:%.*]] to <4 x i16>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[Y:%.*]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2
-; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2
-; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP9]], i64 4
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP8]], i64 5
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP11]], i64 6
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP10]], i64 7
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP3]], i64 4
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP2]], i64 5
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP5]], i64 6
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP4]], i64 7
-; CHECK-NEXT:    [[TMP22:%.*]] = mul <8 x i16> [[TMP16]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>*
-; CHECK-NEXT:    store <8 x i16> [[TMP22]], <8 x i16>* [[TMP23]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP10]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
Index: llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
@@ -137,12 +137,12 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[G20]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP4]], <8 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[TMP7]], <8 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
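To make the resulting permutation concrete, take the first loadorder.ll hunk:
the removed insertelement chain gathered the second half of the reduction as
x[stride+1], x[stride], x[stride+3], x[stride+2], i.e. offsets {1, 0, 3, 2}
from a single base. A toy driver, reduced to that single-base case (the
offsets are read off the removed CHECK lines; the code is illustrative, not
from the patch):

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      // Offsets of the gathered loads relative to x + stride, in the order
      // the scalar code requested them.
      std::vector<int> Offsets = {1, 0, 3, 2};

      // Sort the original indices by offset, as the single-base case of
      // clusterSortPtrAccesses would.
      std::vector<unsigned> Order(Offsets.size());
      std::iota(Order.begin(), Order.end(), 0u);
      std::sort(Order.begin(), Order.end(),
                [&](unsigned A, unsigned B) { return Offsets[A] < Offsets[B]; });

      // The sorted offsets 0,1,2,3 form one consecutive run, so the four
      // scalar loads become a single <4 x i16> load plus a lane reorder.
      for (unsigned Idx : Order)
        std::printf("%u ", Idx); // prints: 1 0 3 2
      std::printf("\n");
      return 0;
    }

That order is what getReorderingData now returns for the gather node, which is
why the new CHECK lines show one vector load per group feeding the muls
directly, with any remaining lane reordering folded into the trailing
shufflevector.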