diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -847,7 +847,8 @@ for (auto &C : ScalarOptimizerLateEPCallbacks) C(FPM, Level); - FPM.addPass(SimplifyCFGPass()); + FPM.addPass(SimplifyCFGPass( + SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true))); FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); @@ -1322,8 +1323,6 @@ // convert to more optimized IR using more aggressive simplify CFG options. // The extra sinking transform can create larger basic blocks, so do this // before SLP vectorization. - // FIXME: study whether hoisting and/or sinking of common instructions should - // be delayed until after SLP vectorizer. OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) .convertSwitchToLookupTable(true) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -509,7 +509,9 @@ if (RerollLoops) MPM.add(createLoopRerollPass()); - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + // Merge & remove BBs and sink & hoist common instructions. + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true))); // Clean up after everything. MPM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, MPM); @@ -823,8 +825,6 @@ // convert to more optimized IR using more aggressive simplify CFG options. // The extra sinking transform can create larger basic blocks, so do this // before SLP vectorization. - // FIXME: study whether hoisting and/or sinking of common instructions should - // be delayed until after SLP vectorizer. 
MPM.add(createCFGSimplificationPass(SimplifyCFGOptions() .forwardSwitchCondToPhi(true) .convertSwitchToLookupTable(true) diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_cspgo_bar_use.ll b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_cspgo_bar_use.ll --- a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_cspgo_bar_use.ll +++ b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_cspgo_bar_use.ll @@ -11,9 +11,12 @@ br i1 %tobool, label %if.else, label %if.then, !prof !30 if.then: + ; The calls here ensure that the instructions are not hoisted by SimplifyCFG. + call void @clobber() %0 = load i32, i32* @odd, align 4 %inc = add i32 %0, 1 store i32 %inc, i32* @odd, align 4 + call void @clobber() br label %if.end if.else: @@ -26,6 +29,8 @@ ret void } +declare void @clobber() + define internal fastcc i32 @cond(i32 %i) #1 !prof !29 !PGOFuncName !35 { entry: %rem = srem i32 %i, 2 diff --git a/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll b/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll --- a/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll +++ b/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll @@ -103,10 +103,10 @@ ret void } ; CSPGOSUMMARY-LABEL: @foo -; CSPGOSUMMARY: %even.sink{{[0-9]*}} = select i1 %tobool.i{{[0-9]*}}, i32* @even, i32* @odd -; CSPGOSUMMARY-SAME: !prof ![[BW1_CSPGO_FOO:[0-9]+]] -; CSPGOSUMMARY: %even.sink{{[0-9]*}} = select i1 %tobool.i{{[0-9]*}}, i32* @even, i32* @odd -; CSPGOSUMMARY-SAME: !prof ![[BW2_CSPGO_FOO:[0-9]+]] +; CSPGOSUMMARY: %odd.sink.i{{[0-9]*}} = select i1 %tobool.i{{[0-9]*}}, i32* @even, i32* @odd +; CSPGOSUMMARY-SAME: !prof ![[BW_CSPGO_BAR]] +; CSPGOSUMMARY: %odd.sink.i{{[0-9]*}} = select i1 %tobool.i{{[0-9]*}}, i32* @even, i32* @odd +; CSPGOSUMMARY-SAME: !prof ![[BW_CSPGO_BAR]] declare dso_local i32 @bar_m(i32) declare dso_local i32 @bar_m2(i32) @@ -152,5 +152,3 @@ ; CSPGOSUMMARY: {{![0-9]+}} = !{!"MaxFunctionCount", i64 200000} ; CSPGOSUMMARY: {{![0-9]+}} = !{!"NumCounts", i64 23} ; 
CSPGOSUMMARY-DAG: ![[BW_CSPGO_BAR]] = !{!"branch_weights", i32 100000, i32 100000} -; CSPGOSUMMARY-DAG: ![[BW1_CSPGO_FOO]] = !{!"branch_weights", i32 100000, i32 0} -; CSPGOSUMMARY-DAG: ![[BW2_CSPGO_FOO]] = !{!"branch_weights", i32 0, i32 100000} diff --git a/llvm/test/Transforms/PGOProfile/thinlto_cspgo_use.ll b/llvm/test/Transforms/PGOProfile/thinlto_cspgo_use.ll --- a/llvm/test/Transforms/PGOProfile/thinlto_cspgo_use.ll +++ b/llvm/test/Transforms/PGOProfile/thinlto_cspgo_use.ll @@ -8,6 +8,7 @@ ; RUN: -r=%t1.bc,bar,l \ ; RUN: -r=%t1.bc,main,plx \ ; RUN: -r=%t2.bc,bar,pl \ +; RUN: -r=%t2.bc,clobber,pl \ ; RUN: -r=%t2.bc,odd,pl \ ; RUN: -r=%t2.bc,even,pl ; RUN: llvm-dis %t.1.4.opt.bc -o - | FileCheck %s --check-prefix=CSUSE diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll @@ -140,16 +140,61 @@ define void @loop2(float* %A, float* %B, i32* %C, float %x) { ; CHECK-LABEL: @loop2( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 10000 +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP6]] to float* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[C]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND011:%.*]] = icmp ugt float* [[SCEVGEP9]], [[B]] +; CHECK-NEXT: [[BOUND112:%.*]] = icmp ugt float* [[SCEVGEP]], 
[[A]] +; CHECK-NEXT: [[FOUND_CONFLICT13:%.*]] = and i1 [[BOUND011]], [[BOUND112]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT13]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[LOOP_BODY:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[DOT0:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 0 +; CHECK-NEXT: [[DOT017:%.*]] = getelementptr inbounds float, float* [[A]], i64 0 +; CHECK-NEXT: [[DOT018:%.*]] = getelementptr inbounds float, float* [[B]], i64 0 +; CHECK-NEXT: [[INDEX_NEXT_0:%.*]] = add i64 0, 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX_NEXT_PHI:%.*]] = phi i64 [ [[INDEX_NEXT_0]], [[VECTOR_PH]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE:%.*]] ] +; CHECK-NEXT: [[DOTPHI:%.*]] = phi float* [ [[DOT018]], [[VECTOR_PH]] ], [ [[DOT120:%.*]], [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[DOTPHI21:%.*]] = phi float* [ [[DOT017]], [[VECTOR_PH]] ], [ [[DOT119:%.*]], [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[DOTPHI22:%.*]] = phi i32* [ [[DOT0]], [[VECTOR_PH]] ], [ [[DOT1:%.*]], [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[DOTPHI22]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 20, i32 20, i32 20, i32 20> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[DOTPHI21]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4, !alias.scope !11 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[WIDE_LOAD14]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[DOTPHI]] to <4 x float>* +; 
CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !alias.scope !13, !noalias !15 +; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP5]], [[WIDE_LOAD15]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP5]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[DOTPHI]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP8]], align 4, !alias.scope !13, !noalias !15 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT_PHI]], 10000 +; CHECK-NEXT: br i1 [[TMP9]], label [[EXIT:%.*]], label [[VECTOR_BODY_VECTOR_BODY_CRIT_EDGE]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: vector.body.vector.body_crit_edge: +; CHECK-NEXT: [[DOT1]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX_NEXT_PHI]] +; CHECK-NEXT: [[DOT119]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT_PHI]] +; CHECK-NEXT: [[DOT120]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX_NEXT_PHI]] +; CHECK-NEXT: [[INDEX_NEXT_1]] = add i64 [[INDEX_NEXT_PHI]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY]] ; CHECK: loop.body: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[C_GEP:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[IV1]] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[C_GEP:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[IV1]] ; CHECK-NEXT: [[C_LV:%.*]] = load i32, i32* [[C_GEP]], align 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[C_LV]], 20 -; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV1]] +; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV1]] ; CHECK-NEXT: [[A_LV_0:%.*]] = load float, float* [[A_GEP_0]], align 4 -; CHECK-NEXT: [[MUL2_I81_I:%.*]] = fmul float [[A_LV_0]], [[X:%.*]] -; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], 
i64 [[IV1]] +; CHECK-NEXT: [[MUL2_I81_I:%.*]] = fmul float [[A_LV_0]], [[X]] +; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV1]] ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[ELSE:%.*]] ; CHECK: else: ; CHECK-NEXT: [[B_LV:%.*]] = load float, float* [[B_GEP_0]], align 4 @@ -160,7 +205,7 @@ ; CHECK-NEXT: store float [[ADD_SINK]], float* [[B_GEP_0]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[CMP_0:%.*]] = icmp ult i64 [[IV1]], 9999 -; CHECK-NEXT: br i1 [[CMP_0]], label [[LOOP_BODY]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP_0]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ;