diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -29,7 +29,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -182,7 +181,8 @@ // consisting of instructions exclusively computed for producing the operands // of the source instruction. void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice, - bool ForSinking = false); + bool ForSinking = false, + Instruction *SinkPt = nullptr); // Returns true if the condition of the select is highly predictable. bool isSelectHighlyPredictable(const SelectInst *SI); @@ -377,13 +377,13 @@ // false operands. if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) { std::stack<Instruction *> TrueSlice; - getExclBackwardsSlice(TI, TrueSlice, true); + getExclBackwardsSlice(TI, TrueSlice, true, SI); maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size()); TrueSlices.push_back(TrueSlice); } if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) { std::stack<Instruction *> FalseSlice; - getExclBackwardsSlice(FI, FalseSlice, true); + getExclBackwardsSlice(FI, FalseSlice, true, SI); maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size()); FalseSlices.push_back(FalseSlice); } @@ -470,22 +470,6 @@ auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); FalseBranch->setDebugLoc(SI->getDebugLoc()); } - // If there was sinking, move any lifetime-end intrinsic calls found in the - // StartBlock to the newly-created end block to ensure sound lifetime info - // after sinking (in case any use is sinked). 
- else { - SmallVector<Instruction *, 2> EndLifetimeCalls; - for (Instruction &I : *StartBlock) { - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_end) { - EndLifetimeCalls.push_back(&I); - } - } - } - for (auto *LC : EndLifetimeCalls) { - LC->moveBefore(&*EndBlock->getFirstInsertionPt()); - } - } // Insert the real conditional branch based on the original condition. // If we did not create a new block for one of the 'true' or 'false' paths @@ -721,6 +705,22 @@ return false; } +// Check if it is safe to move LoadI to the SinkPt. +// Conservatively assume it is safe only if there is no instruction +// modifying memory in-between the load and the sink point. +static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SinkPt) { + // Ensure that the instructions are on the same basic block. + if (LoadI->getParent() != SinkPt->getParent()) + return false; + auto It = LoadI->getIterator(); + while (&*It != SinkPt) { + if (It->mayWriteToMemory()) + return false; + It++; + } + return true; +} + // For a given source instruction, collect its backwards dependence slice // consisting of instructions exclusively computed for the purpose of producing // the operands of the source instruction. As an approximation // form an one-use chain that leads to the source instruction. void SelectOptimize::getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice, - bool ForSinking) { + bool ForSinking, + Instruction *SinkPt) { SmallPtrSet<Instruction *, 2> Visited; std::queue<Instruction *> Worklist; Worklist.push(I); @@ -751,6 +752,18 @@ isa<SelectInst>(II) || isa<PHINode>(II))) continue; + // Avoid sinking loads in order not to skip state-modifying instructions, + // that may alias with the loaded address. + // Only allow sinking of loads within the same basic block that are + // conservatively proven to be safe. 
+ if (ForSinking && II->mayReadFromMemory() && SinkPt) { + if (II->getParent() != SinkPt->getParent()) { + continue; + } else if (!isSafeToSinkLoad(II, SinkPt)) { + continue; + } + } + // Avoid considering instructions with less frequency than the source // instruction (i.e., avoid colder code regions of the dependence slice). if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent())) diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -185,9 +185,8 @@ ret i32 %sel } -; Cold value operand with load in its one-use dependence slice shoud result +; Cold value operand with load in its one-use dependence slice should result ; into a branch with sinked dependence slice. -; The lifetime-end intrinsics should be soundly preserved. define i32 @expensive_val_operand3(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { ; CHECK-LABEL: @expensive_val_operand3( ; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] @@ -198,19 +197,64 @@ ; CHECK-NEXT: br label [[SELECT_END]] ; CHECK: select.end: ; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ] -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[A]]) ; CHECK-NEXT: ret i32 [[SEL]] ; %load = load i32, ptr %a, align 8 - call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %a) %x = add i32 %load, %b %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17 ret i32 %sel } -; Multiple uses of the load value operand. -define i32 @expensive_val_operand4(i32 %a, ptr nocapture %b, i32 %x, i1 %cmp) { +; Expensive cold value operand with unsafe-to-sink load (partial slice sinking). 
+define i32 @expensive_val_operand4(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { ; CHECK-LABEL: @expensive_val_operand4( +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A:%.*]], align 8 +; CHECK-NEXT: call void @free(ptr [[A]]) +; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]] +; CHECK: select.true.sink: +; CHECK-NEXT: [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]] +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ] +; CHECK-NEXT: ret i32 [[SEL]] +; + %load = load i32, ptr %a, align 8 + call void @free(ptr %a) + %x = add i32 %load, %b + %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17 + ret i32 %sel +} + +; Expensive cold value operand with potentially-unsafe-to-sink load (located +; in a different basic block and thus unchecked for sinkability). +define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { +; CHECK-LABEL: @expensive_val_operand5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A:%.*]], align 8 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] +; CHECK-NEXT: br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]] +; CHECK: select.true.sink: +; CHECK-NEXT: [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]] +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[BB1]] ] +; CHECK-NEXT: ret i32 [[SEL]] +; +entry: + %load = load i32, ptr %a, align 8 + br label %bb1 +bb1: ; preds = %entry + %x = add i32 %load, %b + %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17 + ret i32 %sel +} + +; Multiple uses of the load value operand. 
+define i32 @expensive_val_operand6(i32 %a, ptr nocapture %b, i32 %x, i1 %cmp) { +; CHECK-LABEL: @expensive_val_operand6( ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[B:%.*]], align 4 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[X:%.*]], i32 [[LOAD]] ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL]], [[LOAD]] @@ -450,8 +494,7 @@ ; Function Attrs: nounwind readnone speculatable willreturn declare void @llvm.dbg.value(metadata, metadata, metadata) -; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) +declare void @free(ptr nocapture) !llvm.module.flags = !{!0, !26, !27} !0 = !{i32 1, !"ProfileSummary", !1}