diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4592,7 +4592,19 @@ }; InstructionsState S = getSameOpcode(VL); - if (Depth == RecursionMaxDepth) { + + // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of + // a load), in which case peek through to include it in the tree, without + // ballooning over-budget. + if (Depth >= RecursionMaxDepth && + !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp && + VL.size() >= 4 && + (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) { + return match(I, + m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) && + cast<Instruction>(I)->getOpcode() == + cast<Instruction>(S.MainOp)->getOpcode(); + })))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -1239,103 +1239,102 @@ ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, i8* [[P2:%.*]], i64 4 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[P1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, i8* [[P2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr 
inbounds i8, i8* [[ADD_PTR]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>* -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64_1]] to <4 x i8>* -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_1]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR_2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR64_2]], i64 4 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[ADD_PTR_2]] to <4 x i8>* +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1]] to <4 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[P2]] to <4 x i8>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>* +; 
CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ADD_PTR]] to <4 x i8>* +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ADD_PTR64]] to <4 x i8>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>* ; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, <4 x i8>* [[TMP12]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[ADD_PTR64_2]] to <4 x i8>* +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>* ; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[ARRAYIDX3]] to <4 x i8>* +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[ADD_PTR_1]] to <4 x i8>* ; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, <4 x i8>* [[TMP16]], align 1 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX3_1]] to <4 x i8>* +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[ADD_PTR64_1]] to <4 x i8>* ; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1 ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX3_2]] to <4 x i8>* ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX3_3]] to <4 x i8>* +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX5_2]] to <4 x i8>* ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1 -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP17]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> -; 
CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8* [[ARRAYIDX5]] to <4 x i8>* -; CHECK-NEXT: [[TMP31:%.*]] = load <4 x i8>, <4 x i8>* [[TMP30]], align 1 -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX5_1]] to <4 x i8>* +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[ADD_PTR_2]] to <4 x i8>* +; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP25]], <4 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8* [[ADD_PTR64_2]] to <4 x i8>* ; CHECK-NEXT: [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX5_2]] to <4 x i8>* -; CHECK-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1 -; CHECK-NEXT: [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX5_3]] to <4 x i8>* -; CHECK-NEXT: [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1 -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP37]], <4 x i8> [[TMP35]], <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <16 x i8> [[TMP38]], <16 x i8> [[TMP39]], <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i8> [[TMP40]], <16 x i8> [[TMP41]], <16 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = 
shufflevector <4 x i8> [[TMP13]], <4 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i8> [[TMP44]], <16 x i8> [[TMP45]], <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i8> [[TMP46]], <16 x i8> [[TMP47]], <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i8> [[TMP50]], <16 x i8> [[TMP51]], <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i8> [[TMP52]], <16 x i8> [[TMP53]], <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = zext <16 x i8> [[TMP54]] to <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP49]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP29]], [[TMP43]] +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP33]], <4 x i8> [[TMP19]], <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = sub nsw <16 x i32> [[TMP31]], [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = bitcast i8* [[ARRAYIDX3_3]] to <4 x i8>* +; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, <4 x i8>* [[TMP41]], align 1 +; CHECK-NEXT: [[TMP43:%.*]] = 
shufflevector <4 x i8> [[TMP42]], <4 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = bitcast i8* [[ARRAYIDX5_3]] to <4 x i8>* +; CHECK-NEXT: [[TMP50:%.*]] = load <4 x i8>, <4 x i8>* [[TMP49]], align 1 +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP50]], <4 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i8> [[TMP51]], <16 x i8> [[TMP52]], <16 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i8> [[TMP53]], <16 x i8> [[TMP54]], <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = sub nsw <16 x i32> [[TMP48]], [[TMP56]] ; CHECK-NEXT: [[TMP58:%.*]] = shl nsw <16 x i32> [[TMP57]], -; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP56]] -; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub nsw <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> 
[[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = add nsw <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP71:%.*]] = sub nsw <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add nsw <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub nsw <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], -; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = mul nuw <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP76]] -; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP79]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]]) -; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP82]], 65535 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP82]], 16 +; CHECK-NEXT: [[TMP59:%.*]] = add nsw <16 x i32> [[TMP58]], [[TMP40]] +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = sub nsw 
<16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = add nsw <16 x i32> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP71]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = add nsw <16 x i32> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = sub nsw <16 x i32> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP73]], <16 x i32> [[TMP74]], <16 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = lshr <16 x i32> [[TMP75]], +; CHECK-NEXT: [[TMP77:%.*]] = and <16 x i32> [[TMP76]], +; CHECK-NEXT: [[TMP78:%.*]] = mul nuw <16 x i32> [[TMP77]], +; CHECK-NEXT: [[TMP79:%.*]] = add <16 x i32> [[TMP78]], [[TMP75]] +; CHECK-NEXT: [[TMP80:%.*]] = xor <16 x i32> [[TMP79]], [[TMP78]] +; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP80]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP81]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP81]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]]