diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9549,6 +9549,9 @@
   SmallVector<ShuffledInsertData> ShuffledInserts;
   // Maps vector instruction to original insertelement instruction
   DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
+  // Maps extract Scalar to the corresponding extractelement instruction in the
+  // basic block. Only one extractelement per block should be emitted.
+  DenseMap<Value *, DenseMap<BasicBlock *, Value *>> ScalarToEEs;
   // Extract all of the elements with the external uses.
   for (const auto &ExternalUse : ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
@@ -9573,13 +9576,29 @@
     Value *Lane = Builder.getInt32(ExternalUse.Lane);
     auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
       if (Scalar->getType() != Vec->getType()) {
-        Value *Ex;
-        // "Reuse" the existing extract to improve final codegen.
-        if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
-          Ex = Builder.CreateExtractElement(ES->getOperand(0),
-                                            ES->getOperand(1));
-        } else {
-          Ex = Builder.CreateExtractElement(Vec, Lane);
+        Value *Ex = nullptr;
+        auto It = ScalarToEEs.find(Scalar);
+        if (It != ScalarToEEs.end()) {
+          // No need to emit many extracts, just move the only one in the
+          // current block.
+          auto EEIt = It->second.find(Builder.GetInsertBlock());
+          if (EEIt != It->second.end()) {
+            auto *I = cast<Instruction>(EEIt->second);
+            if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
+                Builder.GetInsertPoint()->comesBefore(I))
+              I->moveBefore(&*Builder.GetInsertPoint());
+            Ex = I;
+          }
+        }
+        if (!Ex) {
+          // "Reuse" the existing extract to improve final codegen.
+          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+            Ex = Builder.CreateExtractElement(ES->getOperand(0),
+                                              ES->getOperand(1));
+          } else {
+            Ex = Builder.CreateExtractElement(Vec, Lane);
+          }
+          ScalarToEEs[Scalar].try_emplace(Builder.GetInsertBlock(), Ex);
         }
         // The then branch of the previous if may produce constants, since 0
         // operand might be a constant.
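The hunks above add the ScalarToEEs cache: for each external scalar, at most one extractelement is emitted per basic block, and when a later user needs the value earlier in that block, the cached extract is hoisted with moveBefore rather than duplicated. A minimal, self-contained sketch of that caching scheme follows, with plain STL containers standing in for LLVM's DenseMap and IRBuilder; Extract, BlockId, and getOrCreateExtract are illustrative names, not LLVM API.

#include <deque>
#include <iostream>
#include <map>
#include <string>

// Stand-in for an emitted extractelement instruction.
struct Extract {
  std::string Scalar; // scalar value being extracted
  int Position;       // insertion position within its basic block
};

using BlockId = int;

// scalar -> (basic block -> the single extract emitted there), mirroring
// DenseMap<Value *, DenseMap<BasicBlock *, Value *>> ScalarToEEs above.
std::map<std::string, std::map<BlockId, Extract *>> ScalarToEEs;
std::deque<Extract> Emitted; // deque: pointers stay valid as it grows

Extract *getOrCreateExtract(const std::string &Scalar, BlockId BB,
                            int InsertPos) {
  auto &PerBlock = ScalarToEEs[Scalar];
  auto It = PerBlock.find(BB);
  if (It != PerBlock.end()) {
    // Reuse the lone extract already emitted in this block; hoist it if the
    // current insertion point comes before it, as the patch does with
    // I->moveBefore(&*Builder.GetInsertPoint()).
    if (InsertPos < It->second->Position)
      It->second->Position = InsertPos;
    return It->second;
  }
  // First request in this block: emit a new extract and cache it, as the
  // patch does with ScalarToEEs[Scalar].try_emplace(...).
  Emitted.push_back(Extract{Scalar, InsertPos});
  PerBlock.emplace(BB, &Emitted.back());
  return &Emitted.back();
}

int main() {
  Extract *A = getOrCreateExtract("%x", /*BB=*/0, /*InsertPos=*/10);
  Extract *B = getOrCreateExtract("%x", /*BB=*/0, /*InsertPos=*/4);
  Extract *C = getOrCreateExtract("%x", /*BB=*/1, /*InsertPos=*/7);
  std::cout << (A == B) << ' '    // 1: same block reuses the same extract
            << A->Position << ' ' // 4: it was hoisted to the earlier point
            << (A != C) << '\n';  // 1: another block gets its own extract
}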
@@ -9610,8 +9629,11 @@
              "Scalar with nullptr as an external user must be registered in "
              "ExternallyUsedValues map");
       if (auto *VecI = dyn_cast<Instruction>(Vec)) {
-        Builder.SetInsertPoint(VecI->getParent(),
-                               std::next(VecI->getIterator()));
+        if (auto *PHI = dyn_cast<PHINode>(VecI))
+          Builder.SetInsertPoint(PHI->getParent()->getFirstNonPHI());
+        else
+          Builder.SetInsertPoint(VecI->getParent(),
+                                 std::next(VecI->getIterator()));
       } else {
         Builder.SetInsertPoint(&F->getEntryBlock().front());
       }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -75,13 +75,13 @@
 ; SSE-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
 ; SSE-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; SSE-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; SSE-NEXT:    [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; SSE-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1
-; SSE-NEXT:    [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], <i64 2, i64 2>
-; SSE-NEXT:    [[TMP10:%.*]] = and <2 x i64> [[TMP9]], <i64 20, i64 20>
+; SSE-NEXT:    [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; SSE-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 1
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; SSE-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; SSE-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
 ; SSE-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
 ; SSE-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
@@ -99,13 +99,13 @@
 ; AVX-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
 ; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1
-; AVX-NEXT:    [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], <i64 2, i64 2>
-; AVX-NEXT:    [[TMP10:%.*]] = and <2 x i64> [[TMP9]], <i64 20, i64 20>
+; AVX-NEXT:    [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 1
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; AVX-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; AVX-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; AVX-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
 ; AVX-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
 ; AVX-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
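The second SLPVectorizer.cpp hunk above (@@ -9610) changes where extracts are placed when the vectorized value is a PHI: LLVM IR requires all PHI nodes to sit contiguously at the top of a block, so inserting directly after the PHI itself could illegally split the PHI group, and the insertion point becomes the block's first non-PHI instruction instead. Below is a small self-contained model of that rule; the toy Block/Instr types and helpers are illustrative only, with firstNonPHI mirroring the getFirstNonPHI() call used in the patch.

#include <cassert>
#include <cstddef>
#include <vector>

// Toy instruction: all we care about is whether it is a PHI.
struct Instr {
  bool IsPHI;
};

using Block = std::vector<Instr>;

// Index of the first non-PHI instruction in the block.
std::size_t firstNonPHI(const Block &BB) {
  std::size_t I = 0;
  while (I < BB.size() && BB[I].IsPHI)
    ++I;
  return I;
}

// Where an extract of the value defined at DefIdx may legally go.
std::size_t extractInsertPoint(const Block &BB, std::size_t DefIdx) {
  if (BB[DefIdx].IsPHI)
    return firstNonPHI(BB); // skip past the whole PHI group
  return DefIdx + 1;        // directly after an ordinary instruction
}

int main() {
  // Two PHIs followed by two ordinary instructions.
  Block BB = {{true}, {true}, {false}, {false}};
  assert(extractInsertPoint(BB, 0) == 2); // not 1: would split the PHI group
  assert(extractInsertPoint(BB, 2) == 3); // unchanged for non-PHI defs
  return 0;
}

The pr35497.ll deltas show the combined payoff of both changes: with only one cached extract per block, the old extractelement plus two insertelements collapse into a single insertelement plus shufflevector, with a second shuffle restoring the original lane order.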