diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -986,10 +986,6 @@ OptimizePM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); - // Enhance/cleanup vector code. - OptimizePM.addPass(VectorCombinePass()); - OptimizePM.addPass(EarlyCSEPass()); - // Eliminate loads by forwarding stores from the previous iteration to loads // of the current iteration. OptimizePM.addPass(LoopLoadEliminationPass()); @@ -1016,6 +1012,9 @@ if (PTO.SLPVectorization) OptimizePM.addPass(SLPVectorizerPass()); + // Enhance/cleanup vector code. + OptimizePM.addPass(VectorCombinePass()); + OptimizePM.addPass(EarlyCSEPass()); OptimizePM.addPass(InstCombinePass()); // Unroll small loops to hide loop backedge latency and saturate any parallel diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -741,8 +741,6 @@ MPM.add(createLoopDistributePass()); MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); - MPM.add(createVectorCombinePass()); - MPM.add(createEarlyCSEPass()); // Eliminate loads by forwarding stores from the previous iteration to loads // of the current iteration. @@ -783,6 +781,10 @@ } } + // Enhance/cleanup vector code. + MPM.add(createVectorCombinePass()); + MPM.add(createEarlyCSEPass()); + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createInstructionCombiningPass()); diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -230,8 +230,6 @@ ; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Inject TLI Mappings ; GCN-O1-NEXT: Loop Vectorization -; GCN-O1-NEXT: Optimize scalar/vector ops -; GCN-O1-NEXT: Early CSE ; GCN-O1-NEXT: Canonicalize natural loops ; GCN-O1-NEXT: Scalar Evolution Analysis ; GCN-O1-NEXT: Function Alias Analysis Results @@ -247,6 +245,8 @@ ; GCN-O1-NEXT: Combine redundant instructions ; GCN-O1-NEXT: Simplify the CFG ; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Optimize scalar/vector ops +; GCN-O1-NEXT: Early CSE ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Natural Loop Information @@ -571,8 +571,6 @@ ; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Inject TLI Mappings ; GCN-O2-NEXT: Loop Vectorization -; GCN-O2-NEXT: Optimize scalar/vector ops -; GCN-O2-NEXT: Early CSE ; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Function Alias Analysis Results @@ -598,6 +596,9 @@ ; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Inject TLI Mappings ; GCN-O2-NEXT: SLP Vectorizer +; GCN-O2-NEXT: Optimize scalar/vector ops +; GCN-O2-NEXT: Early CSE +; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Combine redundant instructions ; GCN-O2-NEXT: Canonicalize natural loops @@ -924,8 +925,6 @@ ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Inject TLI Mappings ; GCN-O3-NEXT: Loop Vectorization -; GCN-O3-NEXT: Optimize scalar/vector ops -; GCN-O3-NEXT: Early CSE ; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Function Alias Analysis Results @@ -951,6 +950,9 @@ ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Inject TLI Mappings ; GCN-O3-NEXT: SLP Vectorizer +; GCN-O3-NEXT: Optimize scalar/vector ops +; GCN-O3-NEXT: Early CSE +; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Combine redundant instructions ; GCN-O3-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -253,8 +253,6 @@ ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis -; CHECK-O-NEXT: Running pass: VectorCombinePass -; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass @@ -262,6 +260,8 @@ ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-O-NEXT: Running pass: VectorCombinePass +; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-defaults.ll @@ -223,8 +223,6 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis -; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass -; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass @@ -232,6 +230,8 @@ ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass +; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -191,8 +191,6 @@ ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass -; CHECK-O-NEXT: Running pass: VectorCombinePass -; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass @@ -200,6 +198,8 @@ ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-O-NEXT: Running pass: VectorCombinePass +; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -202,8 +202,6 @@ ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass -; CHECK-O-NEXT: Running pass: VectorCombinePass -; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass @@ -211,6 +209,8 @@ ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass +; CHECK-O-NEXT: Running pass: VectorCombinePass +; CHECK-O-NEXT: Running pass: EarlyCSEPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -227,8 +227,6 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization -; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Early CSE ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results @@ -254,6 +252,9 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Early CSE +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -232,8 +232,6 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization -; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Early CSE ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results @@ -259,6 +257,9 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Early CSE +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -213,8 +213,6 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization -; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Early CSE ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Function Alias Analysis Results @@ -240,6 +238,9 @@ ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: SLP Vectorizer +; CHECK-NEXT: Optimize scalar/vector ops +; CHECK-NEXT: Early CSE +; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Canonicalize natural loops diff --git a/llvm/test/Other/opt-pipeline-vector-passes.ll b/llvm/test/Other/opt-pipeline-vector-passes.ll --- a/llvm/test/Other/opt-pipeline-vector-passes.ll +++ b/llvm/test/Other/opt-pipeline-vector-passes.ll @@ -12,15 +12,15 @@ ; OLDPM_O1-LABEL: Pass Arguments: ; OLDPM_O1: Loop Vectorization -; OLDPM_O1: Optimize scalar/vector ops ; OLDPM_O1-NOT: SLP Vectorizer +; OLDPM_O1: Optimize scalar/vector ops ; Everything runs at -O2. ; OLDPM_O2-LABEL: Pass Arguments: ; OLDPM_O2: Loop Vectorization -; OLDPM_O2: Optimize scalar/vector ops ; OLDPM_O2: SLP Vectorizer +; OLDPM_O2: Optimize scalar/vector ops ; The loop vectorizer still runs at both -O1/-O2 even with the ; debug flag, but it only works on loops explicitly annotated @@ -28,24 +28,24 @@ ; OLDPM_O1_FORCE_OFF-LABEL: Pass Arguments: ; OLDPM_O1_FORCE_OFF: Loop Vectorization -; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops ; OLDPM_O1_FORCE_OFF-NOT: SLP Vectorizer +; OLDPM_O1_FORCE_OFF: Optimize scalar/vector ops ; OLDPM_O2_FORCE_OFF-LABEL: Pass Arguments: ; OLDPM_O2_FORCE_OFF: Loop Vectorization -; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops ; OLDPM_O2_FORCE_OFF: SLP Vectorizer +; OLDPM_O2_FORCE_OFF: Optimize scalar/vector ops ; There should be no difference with the new pass manager. ; This is tested more thoroughly in other test files. ; NEWPM_O1-LABEL: Running pass: LoopVectorizePass -; NEWPM_O1: Running pass: VectorCombinePass ; NEWPM_O1-NOT: Running pass: SLPVectorizerPass +; NEWPM_O1: Running pass: VectorCombinePass ; NEWPM_O2-LABEL: Running pass: LoopVectorizePass -; NEWPM_O2: Running pass: VectorCombinePass ; NEWPM_O2: Running pass: SLPVectorizerPass +; NEWPM_O2: Running pass: VectorCombinePass define void @f() { ret void diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -O3 -S | FileCheck %s -; RUN: opt < %s -passes='default' -S | FileCheck %s +; RUN: opt < %s -O3 -S | FileCheck %s --check-prefixes=CHECK,OLDPM +; RUN: opt < %s -passes='default' -S | FileCheck %s --check-prefixes=CHECK,NEWPM target triple = "x86_64--" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -69,22 +69,30 @@ } define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) { -; CHECK-LABEL: @add_aggregate_store( -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0 -; CHECK-NEXT: store float [[TMP2]], float* [[R0]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1 -; CHECK-NEXT: store float [[TMP3]], float* [[R1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 -; CHECK-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2 -; CHECK-NEXT: store float [[TMP5]], float* [[R2]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 -; CHECK-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3 -; CHECK-NEXT: store float [[TMP6]], float* [[R3]], align 4 -; CHECK-NEXT: ret void +; OLDPM-LABEL: @add_aggregate_store( +; OLDPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]] +; OLDPM-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] +; OLDPM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> +; OLDPM-NEXT: [[TMP4:%.*]] = bitcast %struct.Vector4* [[R:%.*]] to <4 x float>* +; OLDPM-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4 +; OLDPM-NEXT: ret void +; +; NEWPM-LABEL: @add_aggregate_store( +; NEWPM-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]] +; NEWPM-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; NEWPM-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0 +; NEWPM-NEXT: store float [[TMP2]], float* [[R0]], align 4 +; NEWPM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; NEWPM-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1 +; NEWPM-NEXT: store float [[TMP3]], float* [[R1]], align 4 +; NEWPM-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]] +; NEWPM-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; NEWPM-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2 +; NEWPM-NEXT: store float [[TMP5]], float* [[R2]], align 4 +; NEWPM-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; NEWPM-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3 +; NEWPM-NEXT: store float [[TMP6]], float* [[R3]], align 4 +; NEWPM-NEXT: ret void ; %a00 = extractelement <2 x float> %a0, i32 0 %b00 = extractelement <2 x float> %b0, i32 0 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/horiz-math.ll @@ -9,18 +9,10 @@ define <4 x float> @hadd_reverse_v4f32(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-LABEL: @hadd_reverse_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]] -; CHECK-NEXT: [[VECINIT6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP5]], [[B]] -; CHECK-NEXT: [[VECINIT10:%.*]] = shufflevector <4 x float> [[VECINIT6]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[B]] -; CHECK-NEXT: [[VECINIT14:%.*]] = shufflevector <4 x float> [[VECINIT10]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[VECINIT14]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x float> [[TMP3]] ; %shuffle = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> %shuffle1 = shufflevector <4 x float> %b, <4 x float> %b, <4 x i32> @@ -45,18 +37,11 @@ define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-LABEL: @reverse_hadd_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], [[A]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP3]], [[A]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], [[B]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP9]], [[B]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP10]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[TMP11]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP4]] ; %vecext = extractelement <4 x float> %a, i32 0 %vecext1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -5,19 +5,15 @@ target triple = "x86_64--" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -; FIXME: This should only need 2 'or' instructions. - define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @ext_ext_or_reduction_v4i32( ; CHECK-NEXT: [[Z:%.*]] = and <4 x i32> [[Y:%.*]], [[X:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[Z]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 -; CHECK-NEXT: ret i32 [[TMP7]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[Z]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: ret i32 [[TMP1]] ; %z = and <4 x i32> %x, %y %z0 = extractelement <4 x i32> %z, i32 0