diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -966,12 +966,15 @@ OptimizePM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); + // Enhance/cleanup vector code. + OptimizePM.addPass(VectorCombinePass()); + OptimizePM.addPass(EarlyCSEPass()); + // Eliminate loads by forwarding stores from the previous iteration to loads // of the current iteration. OptimizePM.addPass(LoopLoadEliminationPass()); // Cleanup after the loop optimization passes. - OptimizePM.addPass(VectorCombinePass()); OptimizePM.addPass(InstCombinePass()); // Now that we've formed fast to execute loop structures, we do further @@ -990,10 +993,8 @@ sinkCommonInsts(true))); // Optimize parallel scalar instruction chains into SIMD instructions. - if (PTO.SLPVectorization) { + if (PTO.SLPVectorization) OptimizePM.addPass(SLPVectorizerPass()); - OptimizePM.addPass(VectorCombinePass()); - } OptimizePM.addPass(InstCombinePass()); diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -729,6 +729,8 @@ MPM.add(createLoopDistributePass()); MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); + MPM.add(createVectorCombinePass()); + MPM.add(createEarlyCSEPass()); // Eliminate loads by forwarding stores from the previous iteration to loads // of the current iteration. @@ -739,7 +741,6 @@ // on -O1 and no #pragma is found). Would be good to have these two passes // as function calls, so that we can only pass them when the vectorizer // changed the code. - MPM.add(createVectorCombinePass()); addInstructionCombiningPass(MPM); if (OptLevel > 1 && ExtraVectorizerPasses) { // At higher optimization levels, try to clean up any runtime overlap and @@ -748,7 +749,6 @@ // common computations, hoist loop-invariant aspects out of any outer loop, // and unswitch the runtime checks if possible. Once hoisted, we may have // dead (or speculatable) control flows or more combining opportunities. - MPM.add(createEarlyCSEPass()); MPM.add(createCorrelatedValuePropagationPass()); addInstructionCombiningPass(MPM); MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); @@ -766,7 +766,6 @@ if (SLPVectorize) { MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - MPM.add(createVectorCombinePass()); if (OptLevel > 1 && ExtraVectorizerPasses) { MPM.add(createEarlyCSEPass()); } diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -250,17 +250,15 @@ ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis +; CHECK-O-NEXT: Running pass: VectorCombinePass +; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis -; CHECK-O-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass -; CHECK-O2-NEXT: Running pass: VectorCombinePass -; CHECK-O3-NEXT: Running pass: VectorCombinePass -; CHECK-Os-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-defaults.ll @@ -220,17 +220,15 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis +; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass +; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis -; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass -; CHECK-POSTLINK-O2-NEXT: Running pass: VectorCombinePass -; CHECK-POSTLINK-O3-NEXT: Running pass: VectorCombinePass -; CHECK-POSTLINK-Os-NEXT: Running pass: VectorCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -188,17 +188,15 @@ ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running pass: VectorCombinePass +; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis -; CHECK-O-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass -; CHECK-O2-NEXT: Running pass: VectorCombinePass -; CHECK-O3-NEXT: Running pass: VectorCombinePass -; CHECK-Os-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -199,17 +199,15 @@ ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running pass: VectorCombinePass +; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis -; CHECK-O-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass -; CHECK-O2-NEXT: Running pass: VectorCombinePass -; CHECK-O3-NEXT: Running pass: VectorCombinePass -; CHECK-Os-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -225,6 +225,8 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Early CSE ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results @@ -232,7 +234,6 @@ ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination -; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Lazy Branch Probability Analysis @@ -250,8 +251,6 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: SLP Vectorizer -; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -230,6 +230,8 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Early CSE ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results @@ -237,7 +239,6 @@ ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination -; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Lazy Branch Probability Analysis @@ -255,8 +256,6 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: SLP Vectorizer -; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -212,6 +212,8 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Early CSE ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results @@ -219,7 +221,6 @@ ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Loop Load Elimination -; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Lazy Branch Probability Analysis @@ -237,8 +238,6 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: SLP Vectorizer -; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -; TODO: Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle. +; Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle. ; That may require some coordination between VectorCombine, SLP, and other passes. ; The end goal is to get a single "vaddsubps" instruction for x86 with AVX. @@ -12,11 +12,7 @@ ; CHECK-LABEL: @PR45015( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]] -; CHECK-NEXT: [[T8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[ARG]], [[ARG1]] -; CHECK-NEXT: [[T12:%.*]] = shufflevector <4 x float> [[T8]], <4 x float> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]] -; CHECK-NEXT: [[T16:%.*]] = shufflevector <4 x float> [[T12]], <4 x float> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[T16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[T16]] ; %t = extractelement <4 x float> %arg, i32 0 @@ -45,13 +41,9 @@ define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1) { ; CHECK-LABEL: @add_aggregate( ; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A0]], [[B0]] -; CHECK-NEXT: [[RETVAL_0_1_INSERT:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1]], [[B1]] -; CHECK-NEXT: [[RETVAL_1_1_INSERT:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[RETVAL_0_1_INSERT]], 0 -; CHECK-NEXT: [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[RETVAL_1_1_INSERT]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] +; CHECK-NEXT: [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[TMP1]], 0 +; CHECK-NEXT: [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[TMP2]], 1 ; CHECK-NEXT: ret { <2 x float>, <2 x float> } [[FCA_1_INSERT]] ; %a00 = extractelement <2 x float> %a0, i32 0 @@ -81,18 +73,16 @@ ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 ; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0 ; CHECK-NEXT: store float [[TMP2]], float* [[R0]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[A0]], [[B0]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 ; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1 -; CHECK-NEXT: store float [[TMP4]], float* [[R1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: store float [[TMP3]], float* [[R1]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 ; CHECK-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2 -; CHECK-NEXT: store float [[TMP6]], float* [[R2]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[A1]], [[B1]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; CHECK-NEXT: store float [[TMP5]], float* [[R2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 ; CHECK-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3 -; CHECK-NEXT: store float [[TMP8]], float* [[R3]], align 4 +; CHECK-NEXT: store float [[TMP6]], float* [[R3]], align 4 ; CHECK-NEXT: ret void ; %a00 = extractelement <2 x float> %a0, i32 0