// Patch summary: adds BoUpSLP::isGatherShuffledEntry, which checks whether
// every scalar of a gather tree entry is found either via getTreeEntry() or in
// an earlier NeedToGather entry. When it is, the cost model charges a shuffle
// cost (nothing extra for an identity mask) instead of getGatherCost(), and
// codegen emits a shufflevector of the matched entries' VectorizedValue
// instead of calling gather().
// NOTE(review): template argument lists (e.g. after Optional, SmallVector,
// DenseMap) and some vector constants in the CHECK lines appear to have been
// lost in this copy of the patch -- confirm against the original diff.
// NOTE(review): mask indices are computed as
// <entry number> * TE->Scalars.size() + <lane>; this presumably assumes each
// matched entry is as wide as TE -- TODO confirm.
// NOTE(review): entries found via getTreeEntry() are not checked to precede TE
// in VectorizableTree; verify their VectorizedValue is already set at the
// point where the shuffle is emitted.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1542,6 +1542,14 @@ getGatherCost(FixedVectorType *Ty, const DenseSet &ShuffledIndices) const; + /// Checks if the scalars gathered by \p TE can be represented as shuffle(s) + /// of previous tree entries. + /// \returns ShuffleKind on success, None otherwise. \p Mask is filled with + /// the shuffle mask, \p Entries with the matched source tree entries. + Optional + isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask, + SmallVectorImpl &Entries); + /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. @@ -3560,7 +3568,27 @@ return ReuseShuffleCost + Cost; } } - return ReuseShuffleCost + getGatherCost(VL); + InstructionCost GatherCost = 0; + SmallVector Mask; + SmallVector Entries; + Optional Shuffle = + isGatherShuffledEntry(E, Mask, Entries); + if (Shuffle.hasValue()) { + if (ShuffleVectorInst::isIdentityMask(Mask)) { + LLVM_DEBUG( + dbgs() + << "SLP: perfect diamond match for gather bundle that starts with " + << *VL.front() << ".\n"); + } else { + LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() + << " entries for bundle that starts with " + << *VL.front() << ".\n"); + GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask); + } + } else { + GatherCost = getGatherCost(VL); + } + return ReuseShuffleCost + GatherCost; } assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && @@ -4216,6 +4244,68 @@ return Cost; } +Optional +BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl &Mask, + SmallVectorImpl &Entries) { + auto *VLIt = find_if(VectorizableTree, + [TE](const std::unique_ptr &EntryPtr) { + return EntryPtr.get() == TE; + 
}); + assert(VLIt != VectorizableTree.end() && + "Gathered values should be in the tree."); + Mask.clear(); + Entries.clear(); + DenseMap Used; + int NumShuffles = 0; + for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { + Value *V = TE->Scalars[I]; + if (const TreeEntry *VTE = getTreeEntry(V)) { + auto Res = Used.try_emplace(VTE, NumShuffles); + if (Res.second) { + Entries.push_back(VTE); + ++NumShuffles; + } + Mask.push_back( + Res.first->second * E + + std::distance(VTE->Scalars.begin(), find(VTE->Scalars, V))); + continue; + } + // Otherwise look for V in a gather entry that precedes TE in the tree. + const auto *It = + find_if(make_range(VectorizableTree.begin(), VLIt), + [V](const std::unique_ptr &EntryPtr) { + return EntryPtr->State == TreeEntry::NeedToGather && + is_contained(EntryPtr->Scalars, V); + }); + if (It != VLIt) { + const TreeEntry *VTE = It->get(); + auto Res = Used.try_emplace(VTE, NumShuffles); + if (Res.second) { + Entries.push_back(VTE); + ++NumShuffles; + } + Mask.push_back( + Res.first->second * E + + std::distance(VTE->Scalars.begin(), find(VTE->Scalars, V))); + continue; + } + return None; + } + if (NumShuffles == 1) { + if (ShuffleVectorInst::isReverseMask(Mask)) + return TargetTransformInfo::SK_Reverse; + return TargetTransformInfo::SK_PermuteSingleSrc; + } + if (NumShuffles == 2) { + if (ShuffleVectorInst::isSelectMask(Mask)) + return TargetTransformInfo::SK_Select; + if (ShuffleVectorInst::isTransposeMask(Mask)) + return TargetTransformInfo::SK_Transpose; + return TargetTransformInfo::SK_PermuteTwoSrc; + } + return None; +} + InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, const DenseSet &ShuffledIndices) const { @@ -4499,7 +4589,24 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); - Value *Vec = gather(E->Scalars); + Value *Vec; + SmallVector Mask; + SmallVector Entries; + Optional Shuffle = + isGatherShuffledEntry(E, Mask, Entries); + if 
(Shuffle.hasValue()) { + if (Entries.size() == 1) { + Vec = + Builder.CreateShuffleVector(Entries.front()->VectorizedValue, Mask); + } else { + assert(Entries.size() == 2 && "Expected shuffle of 1 or 2 entries."); + Vec = + Builder.CreateShuffleVector(Entries.front()->VectorizedValue, + Entries.back()->VectorizedValue, Mask); + } + } else { + Vec = gather(E->Scalars); + } if (NeedToShuffleReuses) { ShuffleBuilder.addMask(E->ReuseShuffleIndices); Vec = ShuffleBuilder.finalize(Vec); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -10,7 +10,7 @@ ; REMARK-LABEL: Function: gather_multiple_use ; REMARK: Args: ; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; REMARK-NEXT: - Cost: '-7' +; REMARK-NEXT: - Cost: '-16' ; ; REMARK-NOT: Function: gather_load diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -4,124 +4,52 @@ define i32 @bar() local_unnamed_addr { ; CHECK-LABEL: @bar( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADD103:%.*]] = add nsw i32 undef, undef -; CHECK-NEXT: [[SUB104:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[ADD105:%.*]] = add nsw i32 undef, undef -; CHECK-NEXT: [[SUB106:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[ADD103]], 15 -; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 -; CHECK-NEXT: [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535 -; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] -; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]] -; CHECK-NEXT: [[SHR_I64:%.*]] = lshr i32 [[ADD105]], 15 -; CHECK-NEXT: [[AND_I65:%.*]] = 
and i32 [[SHR_I64]], 65537 -; CHECK-NEXT: [[MUL_I66:%.*]] = mul nuw i32 [[AND_I65]], 65535 -; CHECK-NEXT: [[ADD_I67:%.*]] = add i32 [[MUL_I66]], [[ADD105]] -; CHECK-NEXT: [[XOR_I68:%.*]] = xor i32 [[ADD_I67]], [[MUL_I66]] -; CHECK-NEXT: [[SHR_I69:%.*]] = lshr i32 [[SUB104]], 15 -; CHECK-NEXT: [[AND_I70:%.*]] = and i32 [[SHR_I69]], 65537 -; CHECK-NEXT: [[MUL_I71:%.*]] = mul nuw i32 [[AND_I70]], 65535 -; CHECK-NEXT: [[ADD_I72:%.*]] = add i32 [[MUL_I71]], [[SUB104]] -; CHECK-NEXT: [[XOR_I73:%.*]] = xor i32 [[ADD_I72]], [[MUL_I71]] -; CHECK-NEXT: [[SHR_I74:%.*]] = lshr i32 [[SUB106]], 15 -; CHECK-NEXT: [[AND_I75:%.*]] = and i32 [[SHR_I74]], 65537 -; CHECK-NEXT: [[MUL_I76:%.*]] = mul nuw i32 [[AND_I75]], 65535 -; CHECK-NEXT: [[ADD_I77:%.*]] = add i32 [[MUL_I76]], [[SUB106]] -; CHECK-NEXT: [[XOR_I78:%.*]] = xor i32 [[ADD_I77]], [[MUL_I76]] -; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I68]], [[XOR_I]] -; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I73]] -; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I78]] ; CHECK-NEXT: [[ADD78_1:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[SUB86_1:%.*]] = sub nsw i32 undef, undef ; CHECK-NEXT: [[ADD94_1:%.*]] = add nsw i32 undef, undef ; CHECK-NEXT: [[SUB102_1:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[ADD103_1:%.*]] = add nsw i32 [[ADD94_1]], [[ADD78_1]] -; CHECK-NEXT: [[SUB104_1:%.*]] = sub nsw i32 [[ADD78_1]], [[ADD94_1]] -; CHECK-NEXT: [[ADD105_1:%.*]] = add nsw i32 [[SUB102_1]], [[SUB86_1]] -; CHECK-NEXT: [[SUB106_1:%.*]] = sub nsw i32 [[SUB86_1]], [[SUB102_1]] -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[ADD103_1]], 15 -; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 -; CHECK-NEXT: [[MUL_I_1:%.*]] = mul nuw i32 [[AND_I_1]], 65535 -; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] -; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[MUL_I_1]] -; CHECK-NEXT: [[SHR_I64_1:%.*]] = lshr i32 [[ADD105_1]], 15 -; CHECK-NEXT: [[AND_I65_1:%.*]] = and i32 [[SHR_I64_1]], 65537 -; 
CHECK-NEXT: [[MUL_I66_1:%.*]] = mul nuw i32 [[AND_I65_1]], 65535 -; CHECK-NEXT: [[ADD_I67_1:%.*]] = add i32 [[MUL_I66_1]], [[ADD105_1]] -; CHECK-NEXT: [[XOR_I68_1:%.*]] = xor i32 [[ADD_I67_1]], [[MUL_I66_1]] -; CHECK-NEXT: [[SHR_I69_1:%.*]] = lshr i32 [[SUB104_1]], 15 -; CHECK-NEXT: [[AND_I70_1:%.*]] = and i32 [[SHR_I69_1]], 65537 -; CHECK-NEXT: [[MUL_I71_1:%.*]] = mul nuw i32 [[AND_I70_1]], 65535 -; CHECK-NEXT: [[ADD_I72_1:%.*]] = add i32 [[MUL_I71_1]], [[SUB104_1]] -; CHECK-NEXT: [[XOR_I73_1:%.*]] = xor i32 [[ADD_I72_1]], [[MUL_I71_1]] -; CHECK-NEXT: [[SHR_I74_1:%.*]] = lshr i32 [[SUB106_1]], 15 -; CHECK-NEXT: [[AND_I75_1:%.*]] = and i32 [[SHR_I74_1]], 65537 -; CHECK-NEXT: [[MUL_I76_1:%.*]] = mul nuw i32 [[AND_I75_1]], 65535 -; CHECK-NEXT: [[ADD_I77_1:%.*]] = add i32 [[MUL_I76_1]], [[SUB106_1]] -; CHECK-NEXT: [[XOR_I78_1:%.*]] = xor i32 [[ADD_I77_1]], [[MUL_I76_1]] -; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I68_1]], [[ADD113]] -; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] -; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I73_1]] -; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I78_1]] ; CHECK-NEXT: [[ADD78_2:%.*]] = add nsw i32 undef, undef -; CHECK-NEXT: [[ADD103_2:%.*]] = add nsw i32 undef, [[ADD78_2]] -; CHECK-NEXT: [[SUB104_2:%.*]] = sub nsw i32 [[ADD78_2]], undef -; CHECK-NEXT: [[ADD105_2:%.*]] = add nsw i32 undef, undef -; CHECK-NEXT: [[SUB106_2:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[ADD103_2]], 15 -; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537 -; CHECK-NEXT: [[MUL_I_2:%.*]] = mul nuw i32 [[AND_I_2]], 65535 -; CHECK-NEXT: [[ADD_I_2:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]] -; CHECK-NEXT: [[XOR_I_2:%.*]] = xor i32 [[ADD_I_2]], [[MUL_I_2]] -; CHECK-NEXT: [[SHR_I64_2:%.*]] = lshr i32 [[ADD105_2]], 15 -; CHECK-NEXT: [[AND_I65_2:%.*]] = and i32 [[SHR_I64_2]], 65537 -; CHECK-NEXT: [[MUL_I66_2:%.*]] = mul nuw i32 [[AND_I65_2]], 65535 -; CHECK-NEXT: 
[[ADD_I67_2:%.*]] = add i32 [[MUL_I66_2]], [[ADD105_2]] -; CHECK-NEXT: [[XOR_I68_2:%.*]] = xor i32 [[ADD_I67_2]], [[MUL_I66_2]] -; CHECK-NEXT: [[SHR_I69_2:%.*]] = lshr i32 [[SUB104_2]], 15 -; CHECK-NEXT: [[AND_I70_2:%.*]] = and i32 [[SHR_I69_2]], 65537 -; CHECK-NEXT: [[MUL_I71_2:%.*]] = mul nuw i32 [[AND_I70_2]], 65535 -; CHECK-NEXT: [[ADD_I72_2:%.*]] = add i32 [[MUL_I71_2]], [[SUB104_2]] -; CHECK-NEXT: [[XOR_I73_2:%.*]] = xor i32 [[ADD_I72_2]], [[MUL_I71_2]] -; CHECK-NEXT: [[SHR_I74_2:%.*]] = lshr i32 [[SUB106_2]], 15 -; CHECK-NEXT: [[AND_I75_2:%.*]] = and i32 [[SHR_I74_2]], 65537 -; CHECK-NEXT: [[MUL_I76_2:%.*]] = mul nuw i32 [[AND_I75_2]], 65535 -; CHECK-NEXT: [[ADD_I77_2:%.*]] = add i32 [[MUL_I76_2]], [[SUB106_2]] -; CHECK-NEXT: [[XOR_I78_2:%.*]] = xor i32 [[ADD_I77_2]], [[MUL_I76_2]] -; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I68_2]], [[ADD113_1]] -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[XOR_I_2]] -; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[XOR_I73_2]] -; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I78_2]] ; CHECK-NEXT: [[SUB102_3:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[ADD103_3:%.*]] = add nsw i32 undef, undef -; CHECK-NEXT: [[SUB104_3:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[ADD105_3:%.*]] = add nsw i32 [[SUB102_3]], undef -; CHECK-NEXT: [[SUB106_3:%.*]] = sub nsw i32 undef, [[SUB102_3]] -; CHECK-NEXT: [[SHR_I_3:%.*]] = lshr i32 [[ADD103_3]], 15 -; CHECK-NEXT: [[AND_I_3:%.*]] = and i32 [[SHR_I_3]], 65537 -; CHECK-NEXT: [[MUL_I_3:%.*]] = mul nuw i32 [[AND_I_3]], 65535 -; CHECK-NEXT: [[ADD_I_3:%.*]] = add i32 [[MUL_I_3]], [[ADD103_3]] -; CHECK-NEXT: [[XOR_I_3:%.*]] = xor i32 [[ADD_I_3]], [[MUL_I_3]] -; CHECK-NEXT: [[SHR_I64_3:%.*]] = lshr i32 [[ADD105_3]], 15 -; CHECK-NEXT: [[AND_I65_3:%.*]] = and i32 [[SHR_I64_3]], 65537 -; CHECK-NEXT: [[MUL_I66_3:%.*]] = mul nuw i32 [[AND_I65_3]], 65535 -; CHECK-NEXT: [[ADD_I67_3:%.*]] = add i32 [[MUL_I66_3]], [[ADD105_3]] -; CHECK-NEXT: [[XOR_I68_3:%.*]] 
= xor i32 [[ADD_I67_3]], [[MUL_I66_3]] -; CHECK-NEXT: [[SHR_I69_3:%.*]] = lshr i32 [[SUB104_3]], 15 -; CHECK-NEXT: [[AND_I70_3:%.*]] = and i32 [[SHR_I69_3]], 65537 -; CHECK-NEXT: [[MUL_I71_3:%.*]] = mul nuw i32 [[AND_I70_3]], 65535 -; CHECK-NEXT: [[ADD_I72_3:%.*]] = add i32 [[MUL_I71_3]], [[SUB104_3]] -; CHECK-NEXT: [[XOR_I73_3:%.*]] = xor i32 [[ADD_I72_3]], [[MUL_I71_3]] -; CHECK-NEXT: [[SHR_I74_3:%.*]] = lshr i32 [[SUB106_3]], 15 -; CHECK-NEXT: [[AND_I75_3:%.*]] = and i32 [[SHR_I74_3]], 65537 -; CHECK-NEXT: [[MUL_I76_3:%.*]] = mul nuw i32 [[AND_I75_3]], 65535 -; CHECK-NEXT: [[ADD_I77_3:%.*]] = add i32 [[MUL_I76_3]], [[SUB106_3]] -; CHECK-NEXT: [[XOR_I78_3:%.*]] = xor i32 [[ADD_I77_3]], [[MUL_I76_3]] -; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I68_3]], [[ADD113_2]] -; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[XOR_I_3]] -; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[XOR_I73_3]] -; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I78_3]] -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD113_3]], 16 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[SUB102_3]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 undef, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[SUB102_1]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 undef, i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 undef, i32 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 undef, i32 5 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 undef, i32 6 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 7 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[ADD78_1]], i32 8 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB86_1]], i32 9 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 undef, i32 10 +; CHECK-NEXT: 
[[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[ADD78_2]], i32 11 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 undef, i32 12 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 undef, i32 13 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 undef, i32 14 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 undef, i32 15 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> , i32 [[SUB86_1]], i32 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 undef, i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 undef, i32 4 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 undef, i32 5 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 undef, i32 6 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[ADD78_1]], i32 7 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[ADD94_1]], i32 8 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[SUB102_1]], i32 9 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[ADD78_2]], i32 10 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 undef, i32 11 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 undef, i32 12 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 undef, i32 13 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 undef, i32 14 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[SUB102_3]], i32 15 +; CHECK-NEXT: [[TMP30:%.*]] = add nsw <16 x i32> [[TMP15]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = sub nsw <16 x i32> [[TMP15]], [[TMP29]] +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i32> [[TMP30]], <16 x i32> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = lshr <16 x i32> [[TMP32]], +; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i32> [[TMP33]], +; CHECK-NEXT: 
[[TMP35:%.*]] = mul nuw <16 x i32> [[TMP34]], +; CHECK-NEXT: [[TMP36:%.*]] = add <16 x i32> [[TMP35]], [[TMP32]] +; CHECK-NEXT: [[TMP37:%.*]] = xor <16 x i32> [[TMP36]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP37]]) +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP38]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]]