diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3198,10 +3198,42 @@
                             const EdgeInfo &UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
 
+  SmallVector<int> ReuseShuffleIndicies;
+  SmallVector<Value *> UniqueValues;
+  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
+                                &UserTreeIdx,
+                                this](const InstructionsState &S) {
+    // Check that every instruction appears once in this bundle.
+    DenseMap<Value *, unsigned> UniquePositions;
+    for (Value *V : VL) {
+      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+      ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1
+                                                           : Res.first->second);
+      if (Res.second)
+        UniqueValues.emplace_back(V);
+    }
+    size_t NumUniqueScalarValues = UniqueValues.size();
+    if (NumUniqueScalarValues == VL.size()) {
+      ReuseShuffleIndicies.clear();
+    } else {
+      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+      if (NumUniqueScalarValues <= 1 ||
+          !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+        return false;
+      }
+      VL = UniqueValues;
+    }
+    return true;
+  };
+
   InstructionsState S = getSameOpcode(VL);
   if (Depth == RecursionMaxDepth) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+    if (TryToFindDuplicates(S))
+      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                   ReuseShuffleIndicies);
     return;
   }
 
@@ -3210,7 +3242,9 @@
       isa<ScalableVectorType>(
           cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
-    newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+    if (TryToFindDuplicates(S))
+      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                   ReuseShuffleIndicies);
     return;
   }
 
@@ -3236,7 +3270,9 @@
      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
       !all_of(VL, isVectorLikeInstWithConstOps))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
-    newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+    if (TryToFindDuplicates(S))
+      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                   ReuseShuffleIndicies);
     return;
   }
 
@@ -3258,7 +3294,9 @@
     LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
     if (!E->isSame(VL)) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+      if (TryToFindDuplicates(S))
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndicies);
       return;
     }
     // Record the reuse of the tree node.  FIXME, currently this is only used to
@@ -3277,7 +3315,9 @@
     if (getTreeEntry(I)) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is already in tree.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+      if (TryToFindDuplicates(S))
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndicies);
       return;
     }
   }
@@ -3288,7 +3328,9 @@
   for (Value *V : VL) {
     if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+      if (TryToFindDuplicates(S))
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndicies);
      return;
    }
  }
@@ -3307,28 +3349,8 @@
   }
 
   // Check that every instruction appears once in this bundle.
-  SmallVector<int> ReuseShuffleIndicies;
-  SmallVector<Value *> UniqueValues;
-  DenseMap<Value *, unsigned> UniquePositions;
-  for (Value *V : VL) {
-    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
-    ReuseShuffleIndicies.emplace_back(Res.first->second);
-    if (Res.second)
-      UniqueValues.emplace_back(V);
-  }
-  size_t NumUniqueScalarValues = UniqueValues.size();
-  if (NumUniqueScalarValues == VL.size()) {
-    ReuseShuffleIndicies.clear();
-  } else {
-    LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
-    if (NumUniqueScalarValues <= 1 ||
-        !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
-      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
-      return;
-    }
-    VL = UniqueValues;
-  }
+  if (!TryToFindDuplicates(S))
+    return;
 
   auto &BSRef = BlocksSchedules[BB];
   if (!BSRef)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -16,38 +16,13 @@
 
 define void @splat(i8 %a, i8 %b, i8 %c) {
 ; SSE-LABEL: @splat(
-; SSE-NEXT:    [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
-; SSE-NEXT:    store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
-; SSE-NEXT:    [[TMP2:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = xor i8 [[C]], [[A]]
-; SSE-NEXT:    store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
-; SSE-NEXT:    store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = xor i8 [[C]], [[A]]
-; SSE-NEXT:    store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = xor i8 [[C]], [[B]]
-; SSE-NEXT:    store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8), align 1
-; SSE-NEXT:    [[TMP10:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11), align 1
-; SSE-NEXT:    [[TMP13:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12), align 1
-; SSE-NEXT:    [[TMP14:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13), align 1
-; SSE-NEXT:    [[TMP15:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14), align 1
-; SSE-NEXT:    [[TMP16:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15), align 1
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[B:%.*]], i32 1
+; SSE-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; SSE-NEXT:    [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]]
+; SSE-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @splat(
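
Note on the refactoring above: TryToFindDuplicates folds the old in-place duplicate check into a lambda so that every early "gather" exit can still record ReuseShuffleIndicies instead of dropping them. For each scalar it records the index of the value's first occurrence (undefs get -1), which becomes the reuse-shuffle mask; bundles whose unique-scalar count is 1 or not a power of two are still rejected. The following is a minimal standalone sketch of that mask computation, not LLVM code; all names are hypothetical and the undef sentinel is omitted for brevity. It runs on the @splat test's non-splat xor operands (a at every lane except b at lanes 5 and 7):

// dup_mask_sketch.cpp -- standalone illustration of the reuse-shuffle mask.
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

// Returns true and fills Mask/Unique on success; returns false when the
// bundle would have to be gathered (too few unique values, or a
// non-power-of-two unique count, which cannot form a legal vector width).
static bool tryToFindDuplicates(const std::vector<std::string> &Bundle,
                                std::vector<int> &Mask,
                                std::vector<std::string> &Unique) {
  std::unordered_map<std::string, int> FirstPos;
  for (const std::string &V : Bundle) {
    auto Res = FirstPos.try_emplace(V, (int)Unique.size());
    Mask.push_back(Res.first->second); // index of the first occurrence
    if (Res.second)                    // first time we see this value
      Unique.push_back(V);
  }
  size_t N = Unique.size();
  if (N == Bundle.size()) { // no duplicates: no shuffle needed
    Mask.clear();
    return true;
  }
  return N > 1 && (N & (N - 1)) == 0; // power-of-two unique count required
}

int main() {
  std::vector<std::string> Bundle = {"a", "a", "a", "a", "a", "b", "a", "b",
                                     "a", "a", "a", "a", "a", "a", "a", "a"};
  std::vector<int> Mask;
  std::vector<std::string> Unique;
  if (tryToFindDuplicates(Bundle, Mask, Unique)) {
    std::printf("unique: %zu, mask:", Unique.size());
    for (int I : Mask)
      std::printf(" %d", I);
    std::printf("\n");
  }
  return 0;
}

Run as-is this prints "unique: 2" and the 0/1 pattern that appears as the SHUFFLE1 mask in the updated SSE check lines.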