diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -929,6 +929,9 @@
   /// ExtractElement, ExtractValue), which can be part of the graph.
   Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
 
+  /// Sort loads into increasing pointer offsets to allow greater clustering.
+  Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
+
   /// Gets reordering data for the given tree entry. If the entry is vectorized
   /// - just return ReorderIndices, otherwise check if the scalars can be
   /// reordered and return the most optimal order.
@@ -3441,6 +3444,93 @@
   return None;
 }
 
+static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
+                                   const DataLayout &DL, ScalarEvolution &SE,
+                                   SmallVectorImpl<unsigned> &SortedIndices) {
+  assert(llvm::all_of(
+             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
+         "Expected list of pointer operands.");
+  // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
+  // Ptr into, sort by Offset, and return as sorted indices with related
+  // values next to one another.
+  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
+  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
+
+  unsigned Cnt = 1;
+  for (Value *Ptr : VL.drop_front()) {
+    bool Found = any_of(Bases, [&](auto &Base) {
+      Optional<int> Diff =
+          getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
+                          /*StrictCheck=*/true);
+      if (!Diff)
+        return false;
+
+      Base.second.emplace_back(Ptr, *Diff, Cnt++);
+      return true;
+    });
+
+    if (!Found) {
+      // If we haven't found enough to usefully cluster, return early.
+      if (Bases.size() > VL.size() / 2 - 1)
+        return false;
+
+      // Not found already - add a new base.
+      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
+    }
+  }
+
+  // For each of the bases, sort the pointers by Offset and check if any of
+  // them become consecutive.
+  bool AnyConsecutive = false;
+  for (auto &Base : Bases) {
+    auto &Vec = Base.second;
+    if (Vec.size() > 1) {
+      llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
+                                const std::tuple<Value *, int, unsigned> &Y) {
+        return std::get<1>(X) < std::get<1>(Y);
+      });
+      int InitialOffset = std::get<1>(Vec[0]);
+      AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {
+        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
+      });
+    }
+  }
+
+  // Fill the SortedIndices array only if it looks worthwhile to sort the ptrs.
+  SortedIndices.clear();
+  if (!AnyConsecutive)
+    return false;
+
+  for (auto &Base : Bases) {
+    for (auto &T : Base.second)
+      SortedIndices.push_back(std::get<2>(T));
+  }
+
+  assert(SortedIndices.size() == VL.size() &&
+         "Expected SortedIndices to be the size of VL");
+  return true;
+}
+
+Optional<BoUpSLP::OrdersType>
+BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
+  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
+  Type *ScalarTy = TE.Scalars[0]->getType();
+
+  SmallVector<Value *> Ptrs;
+  Ptrs.reserve(TE.Scalars.size());
+  for (Value *V : TE.Scalars) {
+    auto *L = dyn_cast<LoadInst>(V);
+    if (!L || !L->isSimple())
+      return None;
+    Ptrs.push_back(L->getPointerOperand());
+  }
+
+  BoUpSLP::OrdersType Order;
+  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
+    return Order;
+  return None;
+}
+
 Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
                                                          bool TopToBottom) {
   // No need to reorder if need to shuffle reuses, still need to shuffle the
@@ -3481,6 +3571,9 @@
     }
     if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
       return CurrentOrder;
+    if (TE.Scalars.size() >= 4)
+      if (Optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
+        return Order;
   }
   return None;
 }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -342,44 +342,20 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1
-; CHECK-NEXT: [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]]
-; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2
-; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]]
-; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3
-; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]]
 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]]
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]]
-; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]]
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[X]] to <4 x i16>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>*
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2
-; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2
-; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2
-; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP9]], i64 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP8]], i64 5
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP11]], i64 6
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP10]], i64 7
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP3]], i64 4
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP2]], i64 5
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP5]], i64 6
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP4]], i64 7
-; CHECK-NEXT: [[TMP22:%.*]] = mul <8 x i16> [[TMP16]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP22]])
-; CHECK-NEXT: ret i16 [[TMP23]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP10]])
+; CHECK-NEXT: ret i16 [[TMP11]]
 ;
 entry:
   %0 = load i16, i16* %x, align 2
@@ -444,73 +420,41 @@
 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4
 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 1
-; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 5
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 2
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 6
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 3
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 7
 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4
 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4
-; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 1
-; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 5
-; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 2
-; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 6
-; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 3
-; CHECK-NEXT: [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 7
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[P2]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX5]], align 1
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX10]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX15]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX22]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX27]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[P1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>*
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX34]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>*
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX39]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ADD_PTR64]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX10_1]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX15_1]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ARRAYIDX22_1]], align 1
-; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX27_1]], align 1
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>*
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>*
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>*
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP3]], <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32>
+; CHECK-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[TMP16]] to <16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>*
 ; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1
-; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX34_1]], align 1
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP8]], i64 4
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP4]], i64 5
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP2]], i64 6
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP0]], i64 7
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i8> [[TMP25]], <16 x i8> [[TMP26]], <16 x i32>
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[TMP20]], i64 12
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP16]], i64 13
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP14]], i64 14
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP12]], i64 15
-; CHECK-NEXT: [[TMP32:%.*]] = zext <16 x i8> [[TMP31]] to <16 x i32>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>*
-; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, <4 x i8>* [[TMP33]], align 1
-; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[ARRAYIDX39_1]], align 1
-; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP11]], i64 4
-; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i8> [[TMP37]], i8 [[TMP5]], i64 5
-; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP3]], i64 6
-; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP39]], i8 [[TMP1]], i64 7
-; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i8> [[TMP40]], <16 x i8> [[TMP41]], <16 x i32>
-; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i8> [[TMP42]], i8 [[TMP35]], i64 12
-; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i8> [[TMP43]], i8 [[TMP17]], i64 13
-; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP44]], i8 [[TMP15]], i64 14
-; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x i8> [[TMP45]], i8 [[TMP13]], i64 15
-; CHECK-NEXT: [[TMP47:%.*]] = zext <16 x i8> [[TMP46]] to <16 x i32>
-; CHECK-NEXT: [[TMP48:%.*]] = mul nuw nsw <16 x i32> [[TMP32]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP48]])
-; CHECK-NEXT: ret i32 [[TMP49]]
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP7]], <16 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
+; CHECK-NEXT: [[TMP28:%.*]] = mul nuw nsw <16 x i32> [[TMP17]], [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP28]])
+; CHECK-NEXT: ret i32 [[TMP29]]
 ;
 entry:
   %idx.ext = sext i32 %off1 to i64
@@ -784,39 +728,31 @@
 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX41]] to <2 x i32>*
 ; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX72]] to <4 x i32>*
+; CHECK-NEXT: [[TMP14:%.*]] = mul nsw <2 x i32> [[TMP11]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP13]], [[TMP9]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <4 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[ARRAYIDX72]] to <4 x i32>*
 ; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 7
 ; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
 ; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, i32* [[Z]], i64 11
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX12]] to <2 x i32>*
-; CHECK-NEXT: [[TMP23:%.*]] = load <2 x i32>, <2 x i32>* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX28]] to <2 x i32>*
-; CHECK-NEXT: [[TMP25:%.*]] = load <2 x i32>, <2 x i32>* [[TMP24]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARRAYIDX48]] to <2 x i32>*
-; CHECK-NEXT: [[TMP27:%.*]] = load <2 x i32>, <2 x i32>* [[TMP26]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast i32* [[ARRAYIDX64]] to <2 x i32>*
-; CHECK-NEXT: [[TMP29:%.*]] = load <2 x i32>, <2 x i32>* [[TMP28]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX12]] to <2 x i32>*
+; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[ARRAYIDX28]] to <2 x i32>*
+; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i32>, <2 x i32>* [[TMP19]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX48]] to <2 x i32>*
+; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX64]] to <2 x i32>*
+; CHECK-NEXT: [[TMP24:%.*]] = load <2 x i32>, <2 x i32>* [[TMP23]], align 4
 ; CHECK-NEXT: store i32 [[MUL73]], i32* [[Z]], align 4
-; CHECK-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP21]], align 4
+; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP16]], align 4
 ; CHECK-NEXT: store i32 [[MUL85]], i32* [[ARRAYIDX76]], align 4
 ; CHECK-NEXT: store i32 [[MUL87]], i32* [[ARRAYIDX88]], align 4
-; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i32> [[TMP29]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i32> [[TMP30]], <4 x i32> [[TMP31]], <4 x i32>
-; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <2 x i32> [[TMP23]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> [[TMP34]], <4 x i32>
-; CHECK-NEXT: [[TMP36:%.*]] = mul nsw <4 x i32> [[TMP32]], [[TMP35]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX84]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP36]], <4 x i32>* [[TMP37]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = mul nsw <2 x i32> [[TMP22]], [[TMP18]]
+; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <2 x i32> [[TMP24]], [[TMP20]]
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> [[TMP26]], <4 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[ARRAYIDX84]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP27]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -918,44 +854,20 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[STRIDE]], 1
-; CHECK-NEXT: [[IDXPROM6:%.*]] = sext i32 [[ADD5]] to i64
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM6]]
-; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[STRIDE]], 2
-; CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM9]]
-; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3
-; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[X]], i64 [[IDXPROM12]]
 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM6]]
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM9]]
-; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds i16, i16* [[Y]], i64 [[IDXPROM12]]
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[X]] to <4 x i16>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX7]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[ARRAYIDX4]] to <4 x i16>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[Y]] to <4 x i16>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[ARRAYIDX20]] to <4 x i16>*
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[TMP6]], align 2
-; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX20]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX23]], align 2
-; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX26]], align 2
-; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX29]], align 2
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[TMP9]], i64 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[TMP8]], i64 5
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[TMP11]], i64 6
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[TMP10]], i64 7
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[TMP3]], i64 4
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x i16> [[TMP18]], i16 [[TMP2]], i64 5
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[TMP5]], i64 6
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i16> [[TMP20]], i16 [[TMP4]], i64 7
-; CHECK-NEXT: [[TMP22:%.*]] = mul <8 x i16> [[TMP16]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>*
-; CHECK-NEXT: store <8 x i16> [[TMP22]], <8 x i16>* [[TMP23]], align 2
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i16> [[TMP5]], [[TMP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i16> [[TMP7]], [[TMP3]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <8 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[DST0:%.*]] to <8 x i16>*
+; CHECK-NEXT: store <8 x i16> [[SHUFFLE]], <8 x i16>* [[TMP10]], align 2
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -1325,421 +1237,105 @@
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64
 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[P1:%.*]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[P2:%.*]], align 1
-; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
-; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV2]]
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX3]], align 1
-; CHECK-NEXT: [[CONV4:%.*]] = zext i8 [[TMP2]] to i32
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX5]], align 1
-; CHECK-NEXT: [[CONV6:%.*]] = zext i8 [[TMP3]] to i32
-; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 [[CONV4]], [[CONV6]]
-; CHECK-NEXT: [[SHL:%.*]] = shl nsw i32 [[SUB7]], 16
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SHL]], [[SUB]]
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 1
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX8]], align 1
-; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP4]] to i32
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 1
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX10]], align 1
-; CHECK-NEXT: [[CONV11:%.*]] = zext i8 [[TMP5]] to i32
-; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 [[CONV9]], [[CONV11]]
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 5
-; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX13]], align 1
-; CHECK-NEXT: [[CONV14:%.*]] = zext i8 [[TMP6]] to i32
-; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 5
-; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX15]], align 1
-; CHECK-NEXT: [[CONV16:%.*]] = zext i8 [[TMP7]] to i32
-; CHECK-NEXT: [[SUB17:%.*]] = sub nsw i32 [[CONV14]], [[CONV16]]
-; CHECK-NEXT: [[SHL18:%.*]] = shl nsw i32 [[SUB17]], 16
-; CHECK-NEXT: [[ADD19:%.*]] = add nsw i32 [[SHL18]], [[SUB12]]
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 2
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX20]], align 1
-; CHECK-NEXT: [[CONV21:%.*]] = zext i8 [[TMP8]] to i32
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 2
-; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX22]], align 1
-; CHECK-NEXT: [[CONV23:%.*]] = zext i8 [[TMP9]] to i32
-; CHECK-NEXT: [[SUB24:%.*]] = sub nsw i32 [[CONV21]], [[CONV23]]
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 6
-; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX25]], align 1
-; CHECK-NEXT: [[CONV26:%.*]] = zext i8 [[TMP10]] to i32
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 6
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX27]], align 1
-; CHECK-NEXT: [[CONV28:%.*]] = zext i8 [[TMP11]] to i32
-; CHECK-NEXT: [[SUB29:%.*]] = sub nsw i32 [[CONV26]], [[CONV28]]
-; CHECK-NEXT: [[SHL30:%.*]] = shl nsw i32 [[SUB29]], 16
-; CHECK-NEXT: [[ADD31:%.*]] = add nsw i32 [[SHL30]], [[SUB24]]
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 3
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX32]], align 1
-; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP12]] to i32
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 3
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX34]], align 1
-; CHECK-NEXT: [[CONV35:%.*]] = zext i8 [[TMP13]] to i32
-; CHECK-NEXT: [[SUB36:%.*]] = sub nsw i32 [[CONV33]], [[CONV35]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 7
-; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX37]], align 1
-; CHECK-NEXT: [[CONV38:%.*]] = zext i8 [[TMP14]] to i32
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 7
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX39]], align 1
-; CHECK-NEXT: [[CONV40:%.*]] = zext i8 [[TMP15]] to i32
-; CHECK-NEXT: [[SUB41:%.*]] = sub nsw i32 [[CONV38]], [[CONV40]]
-; CHECK-NEXT: [[SHL42:%.*]] = shl nsw i32 [[SUB41]], 16
-; CHECK-NEXT: [[ADD43:%.*]] = add nsw i32 [[SHL42]], [[SUB36]]
-; CHECK-NEXT: [[ADD44:%.*]] = add nsw i32 [[ADD19]], [[ADD]]
-; CHECK-NEXT: [[SUB45:%.*]] = sub nsw i32 [[ADD]], [[ADD19]]
-; CHECK-NEXT: [[ADD46:%.*]] = add nsw i32 [[ADD43]], [[ADD31]]
-; CHECK-NEXT: [[SUB47:%.*]] = sub nsw i32 [[ADD31]], [[ADD43]]
-; CHECK-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD46]], [[ADD44]]
-; CHECK-NEXT: [[SUB51:%.*]] = sub nsw i32 [[ADD44]], [[ADD46]]
-; CHECK-NEXT: [[ADD55:%.*]] = add nsw i32 [[SUB47]], [[SUB45]]
-; CHECK-NEXT: [[SUB59:%.*]] = sub nsw i32 [[SUB45]], [[SUB47]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1
 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ADD_PTR]], align 1
-; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP16]] to i32
-; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ADD_PTR64]], align 1
-; CHECK-NEXT: [[CONV2_1:%.*]] = zext i8 [[TMP17]] to i32
-; CHECK-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[CONV_1]], [[CONV2_1]]
 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 4
-; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT: [[CONV4_1:%.*]] = zext i8 [[TMP18]] to i32
 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4
-; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[CONV6_1:%.*]] = zext i8 [[TMP19]] to i32
-; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 [[CONV4_1]], [[CONV6_1]]
-; CHECK-NEXT: [[SHL_1:%.*]] = shl nsw i32 [[SUB7_1]], 16
-; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[SHL_1]], [[SUB_1]]
-; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 1
-; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT: [[CONV9_1:%.*]] = zext i8 [[TMP20]] to i32
-; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 1
-; CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX10_1]], align 1
-; CHECK-NEXT: [[CONV11_1:%.*]] = zext i8 [[TMP21]] to i32
-; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 [[CONV9_1]], [[CONV11_1]]
-; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 5
-; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[ARRAYIDX13_1]], align 1
-; CHECK-NEXT: [[CONV14_1:%.*]] = zext i8 [[TMP22]] to i32
-; CHECK-NEXT: [[ARRAYIDX15_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 5
-; CHECK-NEXT: [[TMP23:%.*]] = load i8, i8* [[ARRAYIDX15_1]], align 1
-; CHECK-NEXT: [[CONV16_1:%.*]] = zext i8 [[TMP23]] to i32
-; CHECK-NEXT: [[SUB17_1:%.*]] = sub nsw i32 [[CONV14_1]], [[CONV16_1]]
-; CHECK-NEXT: [[SHL18_1:%.*]] = shl nsw i32 [[SUB17_1]], 16
-; CHECK-NEXT: [[ADD19_1:%.*]] = add nsw i32 [[SHL18_1]], [[SUB12_1]]
-; CHECK-NEXT: [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 2
-; CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[ARRAYIDX20_1]], align 1
-; CHECK-NEXT: [[CONV21_1:%.*]] = zext i8 [[TMP24]] to i32
-; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 2
-; CHECK-NEXT: [[TMP25:%.*]] = load i8, i8* [[ARRAYIDX22_1]], align 1
-; CHECK-NEXT: [[CONV23_1:%.*]] = zext i8 [[TMP25]] to i32
-; CHECK-NEXT: [[SUB24_1:%.*]] = sub nsw i32 [[CONV21_1]], [[CONV23_1]]
-; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 6
-; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[ARRAYIDX25_1]], align 1
-; CHECK-NEXT: [[CONV26_1:%.*]] = zext i8 [[TMP26]] to i32
-; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 6
-; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* [[ARRAYIDX27_1]], align 1
-; CHECK-NEXT: [[CONV28_1:%.*]] = zext i8 [[TMP27]] to i32
-; CHECK-NEXT: [[SUB29_1:%.*]] = sub nsw i32 [[CONV26_1]], [[CONV28_1]]
-; CHECK-NEXT: [[SHL30_1:%.*]] = shl nsw i32 [[SUB29_1]], 16
-; CHECK-NEXT: [[ADD31_1:%.*]] = add nsw i32 [[SHL30_1]], [[SUB24_1]]
-; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 3
-; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[ARRAYIDX32_1]], align 1
-; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP28]] to i32
-; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 3
-; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ARRAYIDX34_1]], align 1
-; CHECK-NEXT: [[CONV35_1:%.*]] = zext i8 [[TMP29]] to i32
-; CHECK-NEXT: [[SUB36_1:%.*]] = sub nsw i32 [[CONV33_1]], [[CONV35_1]]
-; CHECK-NEXT: [[ARRAYIDX37_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 7
-; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX37_1]], align 1
-; CHECK-NEXT: [[CONV38_1:%.*]] = zext i8 [[TMP30]] to i32
-; CHECK-NEXT: [[ARRAYIDX39_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 7
-; CHECK-NEXT: [[TMP31:%.*]] = load i8, i8* [[ARRAYIDX39_1]], align 1
-; CHECK-NEXT: [[CONV40_1:%.*]] = zext i8 [[TMP31]] to i32
-; CHECK-NEXT: [[SUB41_1:%.*]] = sub nsw i32 [[CONV38_1]], [[CONV40_1]]
-; CHECK-NEXT: [[SHL42_1:%.*]] = shl nsw i32 [[SUB41_1]], 16
-; CHECK-NEXT: [[ADD43_1:%.*]] = add nsw i32 [[SHL42_1]], [[SUB36_1]]
-; CHECK-NEXT: [[ADD44_1:%.*]] = add nsw i32 [[ADD19_1]], [[ADD_1]]
-; CHECK-NEXT: [[SUB45_1:%.*]] = sub nsw i32 [[ADD_1]], [[ADD19_1]]
-; CHECK-NEXT: [[ADD46_1:%.*]] = add nsw i32 [[ADD43_1]], [[ADD31_1]]
-; CHECK-NEXT: [[SUB47_1:%.*]] = sub nsw i32 [[ADD31_1]], [[ADD43_1]]
-; CHECK-NEXT: [[ADD48_1:%.*]] = add nsw i32 [[ADD46_1]], [[ADD44_1]]
-; CHECK-NEXT: [[SUB51_1:%.*]] = sub nsw i32 [[ADD44_1]], [[ADD46_1]]
-; CHECK-NEXT: [[ADD55_1:%.*]] = add nsw i32 [[SUB47_1]], [[SUB45_1]]
-; CHECK-NEXT: [[SUB59_1:%.*]] = sub nsw i32 [[SUB45_1]], [[SUB47_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1
 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 [[IDX_EXT63]]
-; CHECK-NEXT: [[TMP32:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1
-; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32
-; CHECK-NEXT: [[TMP33:%.*]] = load i8, i8* [[ADD_PTR64_1]], align 1
-; CHECK-NEXT: [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32
-; CHECK-NEXT: [[SUB_2:%.*]] = sub nsw i32 [[CONV_2]], [[CONV2_2]]
 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 4
-; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT: [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32
 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 4
-; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[ARRAYIDX5_2]], align 1
-; CHECK-NEXT: [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32
-; CHECK-NEXT: [[SUB7_2:%.*]] = sub nsw i32 [[CONV4_2]], [[CONV6_2]]
-; CHECK-NEXT: [[SHL_2:%.*]] = shl nsw i32 [[SUB7_2]], 16
-; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[SHL_2]], [[SUB_2]]
-; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 1
-; CHECK-NEXT: [[TMP36:%.*]] = load i8, i8* [[ARRAYIDX8_2]], align 1
-; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32
-; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 1
-; CHECK-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX10_2]], align 1
-; CHECK-NEXT: [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32
-; CHECK-NEXT: [[SUB12_2:%.*]] = sub nsw i32 [[CONV9_2]], [[CONV11_2]]
-; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 5
-; CHECK-NEXT: [[TMP38:%.*]] = load i8, i8* [[ARRAYIDX13_2]], align 1
-; CHECK-NEXT: [[CONV14_2:%.*]] = zext i8 [[TMP38]] to i32
-; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 5
-; CHECK-NEXT: [[TMP39:%.*]] = load i8, i8* [[ARRAYIDX15_2]], align 1
-; CHECK-NEXT: [[CONV16_2:%.*]] = zext i8 [[TMP39]] to i32
-; CHECK-NEXT: [[SUB17_2:%.*]] = sub nsw i32 [[CONV14_2]], [[CONV16_2]]
-; CHECK-NEXT: [[SHL18_2:%.*]] = shl nsw i32 [[SUB17_2]], 16
-; CHECK-NEXT: [[ADD19_2:%.*]] = add nsw i32 [[SHL18_2]], [[SUB12_2]]
-; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 2
-; CHECK-NEXT: [[TMP40:%.*]] = load i8, i8* [[ARRAYIDX20_2]], align 1
-; CHECK-NEXT: [[CONV21_2:%.*]] = zext i8 [[TMP40]] to i32
-; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 2
-; CHECK-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX22_2]], align 1
-; CHECK-NEXT: [[CONV23_2:%.*]] = zext i8 [[TMP41]] to i32
-; CHECK-NEXT: [[SUB24_2:%.*]] = sub nsw i32 [[CONV21_2]], [[CONV23_2]]
-; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 6
-; CHECK-NEXT: [[TMP42:%.*]] = load i8, i8* [[ARRAYIDX25_2]], align 1
-; CHECK-NEXT: [[CONV26_2:%.*]] = zext i8 [[TMP42]] to i32
-; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 6
-; CHECK-NEXT: [[TMP43:%.*]] = load i8, i8* [[ARRAYIDX27_2]], align 1
-; CHECK-NEXT: [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32
-; CHECK-NEXT: [[SUB29_2:%.*]] = sub nsw i32 [[CONV26_2]], [[CONV28_2]]
-; CHECK-NEXT: [[SHL30_2:%.*]] = shl nsw i32 [[SUB29_2]], 16
-; CHECK-NEXT: [[ADD31_2:%.*]] = add nsw i32 [[SHL30_2]], [[SUB24_2]]
-; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 3
-; CHECK-NEXT: [[TMP44:%.*]] = load i8, i8* [[ARRAYIDX32_2]], align 1
-; CHECK-NEXT: [[CONV33_2:%.*]] = zext i8 [[TMP44]] to i32
-; CHECK-NEXT: [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 3
-; CHECK-NEXT: [[TMP45:%.*]] = load i8, i8* [[ARRAYIDX34_2]], align 1
-; CHECK-NEXT: [[CONV35_2:%.*]] = zext i8 [[TMP45]] to i32
-; CHECK-NEXT: [[SUB36_2:%.*]] = sub nsw i32 [[CONV33_2]], [[CONV35_2]]
-; CHECK-NEXT: [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 7
-; CHECK-NEXT: [[TMP46:%.*]] = load i8, i8* [[ARRAYIDX37_2]], align 1
-; CHECK-NEXT: [[CONV38_2:%.*]] = zext i8 [[TMP46]] to i32
-; CHECK-NEXT: [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 7
-; CHECK-NEXT: [[TMP47:%.*]] = load i8, i8* [[ARRAYIDX39_2]], align 1
-; CHECK-NEXT: [[CONV40_2:%.*]] = zext i8 [[TMP47]] to i32
-; CHECK-NEXT: [[SUB41_2:%.*]] = sub nsw i32 [[CONV38_2]], [[CONV40_2]]
-; CHECK-NEXT: [[SHL42_2:%.*]] = shl nsw i32 [[SUB41_2]], 16
-; CHECK-NEXT: [[ADD43_2:%.*]] = add nsw i32 [[SHL42_2]], [[SUB36_2]]
-; CHECK-NEXT: [[ADD44_2:%.*]] = add nsw i32 [[ADD19_2]], [[ADD_2]]
-; CHECK-NEXT: [[SUB45_2:%.*]] = sub nsw i32 [[ADD_2]], [[ADD19_2]]
-; CHECK-NEXT: [[ADD46_2:%.*]] = add nsw i32 [[ADD43_2]], [[ADD31_2]]
-; CHECK-NEXT: [[SUB47_2:%.*]] = sub nsw i32 [[ADD31_2]], [[ADD43_2]]
-; CHECK-NEXT: [[ADD48_2:%.*]] = add nsw i32 [[ADD46_2]], [[ADD44_2]]
-; CHECK-NEXT: [[SUB51_2:%.*]] = sub nsw i32 [[ADD44_2]], [[ADD46_2]]
-; CHECK-NEXT: [[ADD55_2:%.*]] = add nsw i32 [[SUB47_2]], [[SUB45_2]]
-; CHECK-NEXT: [[SUB59_2:%.*]] = sub nsw i32 [[SUB45_2]], [[SUB47_2]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR_1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64_1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1
 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
-; CHECK-NEXT: [[TMP48:%.*]] = load i8, i8* [[ADD_PTR_2]], align 1
-; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32
-; CHECK-NEXT: [[TMP49:%.*]] = load i8, i8* [[ADD_PTR64_2]], align 1
-; CHECK-NEXT: [[CONV2_3:%.*]] = zext i8 [[TMP49]] to i32
-; CHECK-NEXT: [[SUB_3:%.*]] = sub nsw i32 [[CONV_3]], [[CONV2_3]]
 ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 4
-; CHECK-NEXT: [[TMP50:%.*]] = load i8, i8* [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT: [[CONV4_3:%.*]] = zext i8 [[TMP50]] to i32
 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 4
-; CHECK-NEXT: [[TMP51:%.*]] = load i8, i8* [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT: [[CONV6_3:%.*]] = zext i8 [[TMP51]] to i32
-; CHECK-NEXT: [[SUB7_3:%.*]] = sub nsw i32 [[CONV4_3]], [[CONV6_3]]
-; CHECK-NEXT: [[SHL_3:%.*]] = shl nsw i32 [[SUB7_3]], 16
-; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[SHL_3]], [[SUB_3]]
-; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 1
-; CHECK-NEXT: [[TMP52:%.*]] = load i8, i8* [[ARRAYIDX8_3]], align 1
-; CHECK-NEXT: [[CONV9_3:%.*]] = zext i8 [[TMP52]] to i32
-; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 1
-; CHECK-NEXT: [[TMP53:%.*]] = load i8, i8* [[ARRAYIDX10_3]], align 1
-; CHECK-NEXT: [[CONV11_3:%.*]] = zext i8 [[TMP53]] to i32
-; CHECK-NEXT: [[SUB12_3:%.*]] = sub nsw i32 [[CONV9_3]], [[CONV11_3]]
-; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 5
-; CHECK-NEXT: [[TMP54:%.*]] = load i8, i8* [[ARRAYIDX13_3]], align 1
-; CHECK-NEXT: [[CONV14_3:%.*]] = zext i8 [[TMP54]] to i32
-; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 5
-; CHECK-NEXT: [[TMP55:%.*]] = load i8, i8* [[ARRAYIDX15_3]], align 1
-; CHECK-NEXT: [[CONV16_3:%.*]] = zext i8 [[TMP55]] to i32
-; CHECK-NEXT: [[SUB17_3:%.*]] = sub nsw i32 [[CONV14_3]], [[CONV16_3]]
-; CHECK-NEXT: [[SHL18_3:%.*]] = shl nsw i32 [[SUB17_3]], 16
-; CHECK-NEXT: [[ADD19_3:%.*]] = add nsw i32 [[SHL18_3]], [[SUB12_3]]
-; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 2
-; CHECK-NEXT: [[TMP56:%.*]] = load i8, i8* [[ARRAYIDX20_3]], align 1
-; CHECK-NEXT: [[CONV21_3:%.*]] = zext i8 [[TMP56]] to i32
-; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 2
-; CHECK-NEXT: [[TMP57:%.*]] = load i8, i8* [[ARRAYIDX22_3]], align 1
-; CHECK-NEXT: [[CONV23_3:%.*]] = zext i8 [[TMP57]] to i32
-; CHECK-NEXT: [[SUB24_3:%.*]] = sub nsw i32 [[CONV21_3]], [[CONV23_3]]
-; CHECK-NEXT: [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 6
-; CHECK-NEXT: [[TMP58:%.*]] = load i8, i8* [[ARRAYIDX25_3]], align 1
-; CHECK-NEXT: [[CONV26_3:%.*]] = zext i8 [[TMP58]] to i32
-; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 6
-; CHECK-NEXT: [[TMP59:%.*]] = load i8, i8* [[ARRAYIDX27_3]], align 1
-; CHECK-NEXT: [[CONV28_3:%.*]] = zext i8 [[TMP59]] to i32
-; CHECK-NEXT: [[SUB29_3:%.*]] = sub nsw i32 [[CONV26_3]], [[CONV28_3]]
-; CHECK-NEXT: [[SHL30_3:%.*]] = shl nsw i32 [[SUB29_3]], 16
-; CHECK-NEXT: [[ADD31_3:%.*]] = add nsw i32 [[SHL30_3]], [[SUB24_3]]
-; CHECK-NEXT: [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 3
-; CHECK-NEXT: [[TMP60:%.*]] = load i8, i8* [[ARRAYIDX32_3]], align 1
-; CHECK-NEXT: [[CONV33_3:%.*]] = zext i8 [[TMP60]] to i32
-; CHECK-NEXT: [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 3
-; CHECK-NEXT: [[TMP61:%.*]] = load i8, i8* [[ARRAYIDX34_3]], align 1
-; CHECK-NEXT: [[CONV35_3:%.*]] = zext i8 [[TMP61]] to i32
-; CHECK-NEXT: [[SUB36_3:%.*]] = sub nsw i32 [[CONV33_3]], [[CONV35_3]]
-; CHECK-NEXT: [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 7
-; CHECK-NEXT: [[TMP62:%.*]] = load i8, i8* [[ARRAYIDX37_3]], align 1
-; CHECK-NEXT: [[CONV38_3:%.*]] = zext i8 [[TMP62]] to i32
-; CHECK-NEXT: [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 7
-; CHECK-NEXT: [[TMP63:%.*]] = load i8, i8* [[ARRAYIDX39_3]], align 1
-; CHECK-NEXT: [[CONV40_3:%.*]] = zext i8 [[TMP63]] to i32
-; CHECK-NEXT: [[SUB41_3:%.*]] = sub nsw i32 [[CONV38_3]], [[CONV40_3]]
-; CHECK-NEXT: [[SHL42_3:%.*]] = shl nsw i32 [[SUB41_3]], 16
-; CHECK-NEXT: [[ADD43_3:%.*]] = add nsw i32 [[SHL42_3]], [[SUB36_3]]
-; CHECK-NEXT: [[ADD44_3:%.*]] = add nsw i32 [[ADD19_3]], [[ADD_3]]
-; CHECK-NEXT: [[SUB45_3:%.*]] = sub nsw i32 [[ADD_3]], [[ADD19_3]]
-; CHECK-NEXT: [[ADD46_3:%.*]] = add nsw i32 [[ADD43_3]], [[ADD31_3]]
-; CHECK-NEXT: [[SUB47_3:%.*]] = sub nsw i32 [[ADD31_3]], [[ADD43_3]]
-; CHECK-NEXT: [[ADD48_3:%.*]] = add nsw i32 [[ADD46_3]], [[ADD44_3]]
-; CHECK-NEXT: [[SUB51_3:%.*]] = sub nsw i32 [[ADD44_3]], [[ADD46_3]]
-; CHECK-NEXT: [[ADD55_3:%.*]] = add nsw i32 [[SUB47_3]], [[SUB45_3]]
-; CHECK-NEXT: [[SUB59_3:%.*]] = sub nsw i32 [[SUB45_3]], [[SUB47_3]]
-; CHECK-NEXT: [[ADD78:%.*]] = add nsw i32 [[ADD48_1]], [[ADD48]]
-; CHECK-NEXT: [[SUB86:%.*]] = sub nsw i32 [[ADD48]], [[ADD48_1]]
-; CHECK-NEXT: [[ADD94:%.*]] = add nsw i32 [[ADD48_3]], [[ADD48_2]]
-; CHECK-NEXT: [[SUB102:%.*]] = sub nsw i32 [[ADD48_2]], [[ADD48_3]]
-; CHECK-NEXT: [[ADD103:%.*]] = add nsw i32 [[ADD94]], [[ADD78]]
-; CHECK-NEXT: [[SUB104:%.*]] = sub nsw i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT: [[ADD105:%.*]] = add nsw i32 [[SUB102]], [[SUB86]]
-; CHECK-NEXT: [[SUB106:%.*]] = sub nsw i32 [[SUB86]], [[SUB102]]
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15
-; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
-; CHECK-NEXT: [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535
-; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]]
-; CHECK-NEXT: [[SHR_I184:%.*]] = lshr i32 [[ADD105]], 15
-; CHECK-NEXT: [[AND_I185:%.*]] = and i32 [[SHR_I184]], 65537
-; CHECK-NEXT: [[MUL_I186:%.*]] = mul nuw i32 [[AND_I185]], 65535
-; CHECK-NEXT: [[ADD_I187:%.*]] = add i32 [[MUL_I186]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I188:%.*]] = xor i32 [[ADD_I187]], [[MUL_I186]]
-; CHECK-NEXT: [[SHR_I189:%.*]] = lshr i32 [[SUB104]], 15
-; CHECK-NEXT: [[AND_I190:%.*]] = and i32 [[SHR_I189]], 65537
-; CHECK-NEXT: [[MUL_I191:%.*]] = mul nuw i32 [[AND_I190]], 65535
-; CHECK-NEXT: [[ADD_I192:%.*]] = add i32 [[MUL_I191]], [[SUB104]]
-; CHECK-NEXT: [[XOR_I193:%.*]] = xor i32 [[ADD_I192]], [[MUL_I191]]
-; CHECK-NEXT: [[SHR_I194:%.*]] = lshr i32 [[SUB106]], 15
-; CHECK-NEXT: [[AND_I195:%.*]] = and i32 [[SHR_I194]], 65537
-; CHECK-NEXT: [[MUL_I196:%.*]] = mul nuw i32 [[AND_I195]], 65535
-; CHECK-NEXT: [[ADD_I197:%.*]] = add i32 [[MUL_I196]], [[SUB106]]
-; CHECK-NEXT: [[XOR_I198:%.*]] = xor i32 [[ADD_I197]], [[MUL_I196]]
-; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I188]], [[XOR_I]]
-; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I193]]
-; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I198]]
-; CHECK-NEXT: [[ADD78_1:%.*]] = add nsw i32 [[ADD55_1]], [[ADD55]]
-; CHECK-NEXT: [[SUB86_1:%.*]] = sub nsw i32 [[ADD55]], [[ADD55_1]]
-; CHECK-NEXT: [[ADD94_1:%.*]] = add nsw i32 [[ADD55_3]], [[ADD55_2]]
-; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 [[ADD55_2]], [[ADD55_3]]
-; CHECK-NEXT: [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]]
-; CHECK-NEXT: [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]]
-; CHECK-NEXT: [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]]
-; CHECK-NEXT: [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]]
-; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15
-; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
-; CHECK-NEXT: [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535
-; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
-; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]]
-; CHECK-NEXT: [[SHR_I184_1:%.*]] = lshr i32 [[ADD105_1]], 15
-; CHECK-NEXT: [[AND_I185_1:%.*]] = and i32 [[SHR_I184_1]], 65537
-; CHECK-NEXT: [[MUL_I186_1:%.*]] = mul nuw i32 [[AND_I185_1]], 65535
-; CHECK-NEXT: [[ADD_I187_1:%.*]] = add i32 [[MUL_I186_1]], [[ADD105_1]]
-; CHECK-NEXT: [[XOR_I188_1:%.*]] = xor i32 [[ADD_I187_1]], [[MUL_I186_1]]
-; CHECK-NEXT: [[SHR_I189_1:%.*]] = lshr i32 [[SUB104_1]], 15
-; CHECK-NEXT: [[AND_I190_1:%.*]] = and i32 [[SHR_I189_1]], 65537
-; CHECK-NEXT: [[MUL_I191_1:%.*]] = mul nuw i32 [[AND_I190_1]], 65535
-; CHECK-NEXT: [[ADD_I192_1:%.*]] = add i32 [[MUL_I191_1]], [[SUB104_1]]
-; CHECK-NEXT: [[XOR_I193_1:%.*]] = xor i32 [[ADD_I192_1]], [[MUL_I191_1]]
-; CHECK-NEXT: [[SHR_I194_1:%.*]] = lshr i32 [[SUB106_1]], 15
-; CHECK-NEXT: [[AND_I195_1:%.*]] = and i32 [[SHR_I194_1]], 65537
-; CHECK-NEXT: [[MUL_I196_1:%.*]] = mul nuw i32 [[AND_I195_1]], 65535
-; CHECK-NEXT: [[ADD_I197_1:%.*]] = add i32 [[MUL_I196_1]], [[SUB106_1]]
-; CHECK-NEXT: [[XOR_I198_1:%.*]] = xor i32 [[ADD_I197_1]], [[MUL_I196_1]]
-; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I188_1]], [[ADD113]]
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
-; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I193_1]]
-; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I198_1]]
-; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 [[SUB51_1]], [[SUB51]]
-; CHECK-NEXT: [[SUB86_2:%.*]] = sub nsw i32 [[SUB51]], [[SUB51_1]]
-; CHECK-NEXT: [[ADD94_2:%.*]] = add nsw i32 [[SUB51_3]], [[SUB51_2]]
-; CHECK-NEXT: [[SUB102_2:%.*]] = sub nsw i32 [[SUB51_2]], [[SUB51_3]]
-; CHECK-NEXT: [[ADD103_2:%.*]] = add nsw i32 [[ADD94_2]], [[ADD78_2]]
-; CHECK-NEXT: [[SUB104_2:%.*]] = sub nsw i32 [[ADD78_2]], [[ADD94_2]]
-; CHECK-NEXT: [[ADD105_2:%.*]] = add nsw i32 [[SUB102_2]], [[SUB86_2]]
-; CHECK-NEXT: [[SUB106_2:%.*]] = sub nsw i32 [[SUB86_2]], [[SUB102_2]]
-; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15
-; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
-; CHECK-NEXT: [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535
-; CHECK-NEXT: [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]]
-; CHECK-NEXT: [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]]
-; CHECK-NEXT: [[SHR_I184_2:%.*]] = lshr i32 [[ADD105_2]], 15
-; CHECK-NEXT: [[AND_I185_2:%.*]] = and i32 [[SHR_I184_2]], 65537
-; CHECK-NEXT: [[MUL_I186_2:%.*]] = mul nuw i32 [[AND_I185_2]], 65535
-; CHECK-NEXT: [[ADD_I187_2:%.*]] = add i32 [[MUL_I186_2]], [[ADD105_2]]
-; CHECK-NEXT: [[XOR_I188_2:%.*]] = xor i32 [[ADD_I187_2]], [[MUL_I186_2]]
-; CHECK-NEXT: [[SHR_I189_2:%.*]] = lshr i32 [[SUB104_2]], 15
-; CHECK-NEXT: [[AND_I190_2:%.*]] = and i32 [[SHR_I189_2]], 65537
-; CHECK-NEXT: [[MUL_I191_2:%.*]] = mul nuw i32 [[AND_I190_2]], 65535
-; CHECK-NEXT: [[ADD_I192_2:%.*]] = add i32 [[MUL_I191_2]], [[SUB104_2]]
-; CHECK-NEXT: [[XOR_I193_2:%.*]] = xor i32 [[ADD_I192_2]], [[MUL_I191_2]]
-; CHECK-NEXT: [[SHR_I194_2:%.*]] = lshr i32 [[SUB106_2]], 15
-; CHECK-NEXT: [[AND_I195_2:%.*]] = and i32 [[SHR_I194_2]], 65537
-; CHECK-NEXT: [[MUL_I196_2:%.*]] = mul nuw i32 [[AND_I195_2]], 65535
-; CHECK-NEXT: [[ADD_I197_2:%.*]] = add i32 [[MUL_I196_2]], [[SUB106_2]]
-; CHECK-NEXT: [[XOR_I198_2:%.*]] = xor i32 [[ADD_I197_2]], [[MUL_I196_2]]
-; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I188_2]], [[ADD113_1]]
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]]
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I193_2]]
-; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I198_2]]
-; CHECK-NEXT: [[ADD78_3:%.*]] = add nsw i32 [[SUB59_1]], [[SUB59]]
-; CHECK-NEXT: [[SUB86_3:%.*]] = sub nsw i32 [[SUB59]], [[SUB59_1]]
-; CHECK-NEXT: [[ADD94_3:%.*]] = add nsw i32 [[SUB59_3]], [[SUB59_2]]
-; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 [[SUB59_2]], [[SUB59_3]]
-; CHECK-NEXT: [[ADD103_3:%.*]] = add nsw i32 [[ADD94_3]], [[ADD78_3]]
-; CHECK-NEXT: [[SUB104_3:%.*]] = sub nsw i32 [[ADD78_3]], [[ADD94_3]]
-; CHECK-NEXT: [[ADD105_3:%.*]] = add nsw i32 [[SUB102_3]], [[SUB86_3]]
-; CHECK-NEXT: [[SUB106_3:%.*]] = sub nsw i32 [[SUB86_3]], [[SUB102_3]]
-; CHECK-NEXT: [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15
-; CHECK-NEXT: [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537
-; CHECK-NEXT: [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535
-; CHECK-NEXT: [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]]
-; CHECK-NEXT: [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]]
-; CHECK-NEXT: [[SHR_I184_3:%.*]] = lshr i32 [[ADD105_3]], 15
-; CHECK-NEXT: [[AND_I185_3:%.*]] = and i32 [[SHR_I184_3]], 65537
-; CHECK-NEXT: [[MUL_I186_3:%.*]] = mul nuw i32 [[AND_I185_3]], 65535
-; CHECK-NEXT: [[ADD_I187_3:%.*]] = add i32 [[MUL_I186_3]], [[ADD105_3]]
-; CHECK-NEXT: [[XOR_I188_3:%.*]] = xor i32 [[ADD_I187_3]], [[MUL_I186_3]]
-; CHECK-NEXT: [[SHR_I189_3:%.*]] = lshr i32 [[SUB104_3]], 15
-; CHECK-NEXT: [[AND_I190_3:%.*]] = and i32 [[SHR_I189_3]], 65537
-; CHECK-NEXT: [[MUL_I191_3:%.*]] = mul nuw i32 [[AND_I190_3]], 65535
-; CHECK-NEXT: [[ADD_I192_3:%.*]] = add i32 [[MUL_I191_3]], [[SUB104_3]]
-; CHECK-NEXT: [[XOR_I193_3:%.*]] = xor i32 [[ADD_I192_3]], [[MUL_I191_3]]
-; CHECK-NEXT: [[SHR_I194_3:%.*]] = lshr i32 [[SUB106_3]], 15
-; CHECK-NEXT: [[AND_I195_3:%.*]] = and i32 [[SHR_I194_3]], 65537
-; CHECK-NEXT: [[MUL_I196_3:%.*]] = mul nuw i32 [[AND_I195_3]], 65535
-; CHECK-NEXT: [[ADD_I197_3:%.*]] = add i32 [[MUL_I196_3]], [[SUB106_3]]
-; CHECK-NEXT: [[XOR_I198_3:%.*]] = xor i32 [[ADD_I197_3]], [[MUL_I196_3]]
-; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I188_3]], [[ADD113_2]]
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]]
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I193_3]]
-; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I198_3]]
-; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[ADD113_3]], 65535
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[ADD_PTR_2]] to <4 x i8>*
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, <4 x i8>* [[TMP12]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[ADD_PTR64_2]] to <4 x i8>*
+; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>*
+; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX3_2]] to <4 x i8>*
+; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX3_3]] to <4 x i8>*
+; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP21]], <16 x i32>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP17]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32>
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>*
+; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, <4 x i8>* [[TMP30]], align 1
+; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX5_2]] to <4 x i8>*
+; CHECK-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
+; CHECK-NEXT: [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX5_3]] to <4 x i8>*
+; CHECK-NEXT: [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1
+; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP37]], <4 x i8> [[TMP35]], <16 x i32>
+; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <16 x i8> [[TMP38]], <16 x i8> [[TMP39]], <16 x i32>
+; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i8> [[TMP40]], <16 x i8> [[TMP41]], <16 x i32>
+; CHECK-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32>
+; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> [[TMP9]], <16 x i32>
+; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i8> [[TMP44]], <16 x i8> [[TMP45]], <16 x i32>
+; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i8> [[TMP46]], <16 x i8> [[TMP47]], <16 x i32>
+; CHECK-NEXT: [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i32>
+; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> [[TMP11]], <16 x i32>
+; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i8> [[TMP50]], <16 x i8> [[TMP51]], <16 x i32>
+; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i8> [[TMP52]], <16 x i8> [[TMP53]], <16 x i32>
+; CHECK-NEXT: [[TMP55:%.*]] = zext <16 x i8> [[TMP54]] to <16 x i32>
+; CHECK-NEXT: [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP49]], [[TMP55]]
+; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP29]], [[TMP43]]
+; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]],
+; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP56]]
+; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32>
+; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> undef, <16 x i32>
+; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP61]]
+; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP60]], [[TMP61]]
+; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32>
+; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32>
+; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP64]], [[TMP65]]
+; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP65]]
+; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
+; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32>
+; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP69]]
+; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP68]], [[TMP69]]
+; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32>
+; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32>
+; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP72]], [[TMP73]]
+; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP72]], [[TMP73]]
+; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32>
+; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]],
+; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]],
+; CHECK-NEXT: [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]],
+; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]]
+; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]]
+; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]])
+; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP82]], 65535
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP82]], 16
 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT: ret i32 [[SHR120]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
@@ -110,12 +110,12 @@
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G20]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP4]], <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
-; CHECK-NEXT: store <8 x i32> [[TMP7]], <8 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
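
Editor's note: the sketch below is a minimal, self-contained model of the clustering performed by clusterSortPtrAccesses, not the patch itself. It assumes pointers have already been resolved to (BaseId, Offset) pairs; in the pass that resolution is done by getPointersDiff using ScalarEvolution, and the pass additionally gives up early when too many pointers start new bases (the Bases.size() > VL.size() / 2 - 1 check), which this model omits. The names PtrInfo and clusterSort are illustrative, not LLVM APIs.

// Group pointers by base, stable-sort each group by offset, and report an
// order that places loads from the same base next to one another when at
// least one group becomes consecutive.
#include <algorithm>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

struct PtrInfo {
  int BaseId; // stand-in for the base pointer identity
  int Offset; // element offset from that base
};

static bool clusterSort(const std::vector<PtrInfo> &VL,
                        std::vector<unsigned> &SortedIndices) {
  // Group (Offset, OrigIdx) pairs by base.
  std::map<int, std::vector<std::pair<int, unsigned>>> Bases;
  for (unsigned I = 0; I < VL.size(); ++I)
    Bases[VL[I].BaseId].push_back({VL[I].Offset, I});

  // A group is consecutive if its sorted offsets form Initial, Initial+1, ...
  bool AnyConsecutive = false;
  for (auto &B : Bases) {
    auto &Vec = B.second;
    std::stable_sort(Vec.begin(), Vec.end());
    bool Consecutive = true;
    for (unsigned I = 0; I < Vec.size(); ++I)
      Consecutive &= Vec[I].first == Vec[0].first + (int)I;
    AnyConsecutive |= Vec.size() > 1 && Consecutive;
  }

  // Only report an order if sorting looks worthwhile.
  SortedIndices.clear();
  if (!AnyConsecutive)
    return false;
  for (auto &B : Bases)
    for (auto &P : B.second)
      SortedIndices.push_back(P.second);
  return true;
}

int main() {
  // Gathered loads x[5], y[4], x[4], y[5]: interleaved across two bases.
  std::vector<PtrInfo> VL = {{0, 5}, {1, 4}, {0, 4}, {1, 5}};
  std::vector<unsigned> Order;
  if (clusterSort(VL, Order))
    for (unsigned Idx : Order)
      std::printf("%u ", Idx); // prints "2 0 1 3": x[4] x[5] y[4] y[5]
  std::printf("\n");
}

With such an order in hand, findPartiallyOrderedLoads in the patch maps each scalar load to its pointer operand, bails out on non-simple loads, and returns the resulting OrdersType so getReorderingData can treat the gather node as a few contiguous vector loads plus shuffles, which is the improvement visible in the updated CHECK lines above.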