diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h --- a/llvm/include/llvm/Transforms/Scalar/GVN.h +++ b/llvm/include/llvm/Transforms/Scalar/GVN.h @@ -331,6 +331,11 @@ bool PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks); + // Try to replace a load which executes on each loop iteration and is fed by a + // pointer-phi/select recurrence with a load in the pre-header. The + // pointer-phi/select recurrence is replaced by a value-based recurrence. + bool performLoopLoadPREThroughSelect(LoadInst *Load); + /// Try to replace a load which executes on each loop iteraiton with Phi /// translation of load in preheader and load(s) in conditionally executed /// paths. diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1517,6 +1517,149 @@ return true; } +bool GVNPass::performLoopLoadPREThroughSelect(LoadInst *Load) { + if (!LI) + return false; + Loop *L = LI->getLoopFor(Load->getParent()); + if (!L) + return false; + + // Start by looking for a load that uses a pointer phi, which forms a select + // recurrence and where the other pointer in the select is also loaded from + // before the select. + auto *Latch = L->getLoopLatch(); + if (!Latch || Latch != L->getHeader()) + return false; + + // We plan to hoist the load to preheader without introducing a new fault. + // In order to do it, we need to prove that we cannot side-exit the loop + // once loop header is first entered before execution of the load. + if (ICF->isDominatedByICFIFromSameBlock(Load)) + return false; + + if (any_of(*L->getHeader(), + [](const Instruction &I) { return I.mayWriteToMemory(); })) + return false; + + auto *PtrPhi = dyn_cast(Load->getPointerOperand()); + if (!PtrPhi || PtrPhi->getParent() != L->getHeader()) + return false; + auto *Sel = dyn_cast(PtrPhi->getIncomingValueForBlock(Latch)); + if (!Sel) + return false; + + Value *Recur = Sel->getOperand(1); + Value *Cur = Sel->getOperand(2); + if (Recur != PtrPhi) + std::swap(Recur, Cur); + + if (Recur != PtrPhi) + return false; + + auto FindDominatingLoad = [Sel, L, this](Value *Ptr) -> LoadInst * { + for (Value *U : Ptr->users()) { + auto *LI = dyn_cast(U); + if (LI && L->contains(LI) && DT->dominates(LI, Sel)) + return LI; + } + return nullptr; + }; + + LoadInst *PhiLoad = FindDominatingLoad(PtrPhi); + LoadInst *IterLoad = FindDominatingLoad(Cur); + if (!PhiLoad || !IterLoad) + return false; + + // PtrPhi must only be used by PhiLoad and the select. + if (any_of(PtrPhi->users(), [PhiLoad, Sel](const User *U) { + return U != PhiLoad && U != Sel; + })) + return false; + + // Now check the uses of the pointer outside the loop. The only real user must + // be a load that is guaranteed to execute when the loop exits. + PHINode *ExitPhi = nullptr; + for (Value *U : Sel->users()) { + auto *I = cast(U); + if (L->contains(I)) { + if (I != PtrPhi) + return false; + continue; + } + + auto *PN = dyn_cast(I); + if (!PN || ExitPhi) + return false; + ExitPhi = PN; + } + + if (!ExitPhi || ExitPhi->getNumIncomingValues() != 1) + return false; + auto LastLI = dyn_cast(*ExitPhi->user_begin()); + auto ExitBlock = L->getExitBlock(); + if (!LastLI || ExitPhi->getParent() != LastLI->getParent() || + ExitPhi->getParent() != ExitBlock) + return false; + + for (Instruction &I : make_range(ExitBlock->begin(), LastLI->getIterator())) { + if (!isGuaranteedToTransferExecutionToSuccessor(&I) || I.mayWriteToMemory()) + return false; + } + + // The loop is of the form below. + // loop: + // %sel.phi = phi i32* [ %start, %ph ], [ %sel, %ph ] + // %l = load %ptr + // %l.sel = load %sel.phi + // %sel = select cond, %ptr, %sel.phi + // ... + // + // exit: + // %res = load %sel + // use(%res) + // + // Now, transform it to + // + // %l.start = load %start + // loop: + // sel.phi.prom = phi i32 [ %l.start, %ph ], [ %sel.prom, %ph ] + // %l = load %ptr + // %sel.prom = select cond, %l, %sel.phi.prom + // ... + // exit: + // use(%sel.prom) + + // First, add a new load of the start pointer in the preheader. + BasicBlock *PH = L->getLoopPreheader(); + LoadInst *PreheaderLoad = + new LoadInst(PhiLoad->getType(), PtrPhi->getIncomingValueForBlock(PH), + PhiLoad->getName() + ".promoted", PH->getTerminator()); + if (MSSAU) { + MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( + PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); + MSSAU->insertUse(cast(NewMemAcc), /*RenameUses=*/true); + } + + // Next, create the value phi and the select for the recurrence. + PHINode *ValuePhi = PHINode::Create(PhiLoad->getType(), 2, + PhiLoad->getName() + ".promoted.load", + &*L->getHeader()->begin()); + Value *NewSel = + SelectInst::Create(Sel->getCondition(), IterLoad, ValuePhi, "", Sel); + ValuePhi->addIncoming(PreheaderLoad, PH); + ValuePhi->addIncoming(NewSel, L->getLoopLatch()); + PhiLoad->replaceAllUsesWith(ValuePhi); + + // Finally, replace phi nodes referencing the pointer select with a phi node + // for the selected value. + PHINode *ExitValuePhi = PHINode::Create( + LastLI->getType(), 2, LastLI->getName() + ".promoted.load", ExitPhi); + ExitValuePhi->addIncoming(NewSel, ExitPhi->getIncomingBlock(0)); + LastLI->replaceAllUsesWith(ExitValuePhi); + markInstructionForDeletion(PhiLoad); + return true; +} + bool GVNPass::performLoopLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks) { @@ -1654,8 +1797,12 @@ // If we have no predecessors that produce a known value for this load, exit // early. - if (ValuesPerBlock.empty()) + if (ValuesPerBlock.empty()) { + if (performLoopLoadPREThroughSelect(Load)) { + return true; + } return Changed; + } // Step 3: Eliminate fully redundancy. // diff --git a/llvm/test/Transforms/GVN/PRE/pre-loop-load-through-select.ll b/llvm/test/Transforms/GVN/PRE/pre-loop-load-through-select.ll --- a/llvm/test/Transforms/GVN/PRE/pre-loop-load-through-select.ll +++ b/llvm/test/Transforms/GVN/PRE/pre-loop-load-through-select.ll @@ -5,20 +5,21 @@ ; CHECK-LABEL: @test_pointer_phi_select_same_object( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_PTR:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 1 +; CHECK-NEXT: [[L_2_PROMOTED:%.*]] = load i32, i32* [[PTR]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[START_PTR]], [[ENTRY:%.*]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[L_2_PROMOTED_LOAD:%.*]] = phi i32 [ [[L_2_PROMOTED]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[START_PTR]], [[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[MIN_PTR:%.*]] = phi i32* [ [[PTR]], [[ENTRY]] ], [ [[MIN_SELECT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[PTR_IV]], align 4 -; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[MIN_PTR]], align 4 -; CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ult i32 [[L_1]], [[L_2]] +; CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ult i32 [[L_1]], [[L_2_PROMOTED_LOAD]] +; CHECK-NEXT: [[TMP0]] = select i1 [[CMP_I_I_I]], i32 [[L_1]], i32 [[L_2_PROMOTED_LOAD]] ; CHECK-NEXT: [[MIN_SELECT]] = select i1 [[CMP_I_I_I]], i32* [[PTR_IV]], i32* [[MIN_PTR]] ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i32, i32* [[PTR_IV]], i64 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32* [[PTR_IV_NEXT]], [[END:%.*]] ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[RES:%.*]] = load i32, i32* [[MIN_SELECT]], align 4 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 [[TMP0]] ; entry: %start.ptr = getelementptr inbounds i32, i32* %ptr, i64 1 @@ -44,20 +45,21 @@ ; CHECK-LABEL: @test_pointer_phi_select_same_object_lcssa( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_PTR:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 1 +; CHECK-NEXT: [[L_2_PROMOTED:%.*]] = load i32, i32* [[PTR]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[START_PTR]], [[ENTRY:%.*]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[L_2_PROMOTED_LOAD:%.*]] = phi i32 [ [[L_2_PROMOTED]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[START_PTR]], [[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[MIN_PTR:%.*]] = phi i32* [ [[PTR]], [[ENTRY]] ], [ [[MIN_SELECT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[PTR_IV]], align 4 -; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[MIN_PTR]], align 4 -; CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ult i32 [[L_1]], [[L_2]] +; CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ult i32 [[L_1]], [[L_2_PROMOTED_LOAD]] +; CHECK-NEXT: [[TMP0]] = select i1 [[CMP_I_I_I]], i32 [[L_1]], i32 [[L_2_PROMOTED_LOAD]] ; CHECK-NEXT: [[MIN_SELECT]] = select i1 [[CMP_I_I_I]], i32* [[PTR_IV]], i32* [[MIN_PTR]] ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i32, i32* [[PTR_IV]], i64 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32* [[PTR_IV_NEXT]], [[END:%.*]] ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[RES:%.*]] = load i32, i32* [[MIN_SELECT]], align 4 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 [[TMP0]] ; entry: %start.ptr = getelementptr inbounds i32, i32* %ptr, i64 1 @@ -83,20 +85,21 @@ define i32 @test_pointer_phi_select_different_objects(i32* %A, i32 *%B, i32* %end) { ; CHECK-LABEL: @test_pointer_phi_select_different_objects( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[L_2_PROMOTED:%.*]] = load i32, i32* [[B:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[MIN_PTR:%.*]] = phi i32* [ [[B:%.*]], [[ENTRY]] ], [ [[MIN_SELECT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[L_2_PROMOTED_LOAD:%.*]] = phi i32 [ [[L_2_PROMOTED]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[A:%.*]], [[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[MIN_PTR:%.*]] = phi i32* [ [[B]], [[ENTRY]] ], [ [[MIN_SELECT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[PTR_IV]], align 4 -; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[MIN_PTR]], align 4 -; CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ult i32 [[L_1]], [[L_2]] +; CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ult i32 [[L_1]], [[L_2_PROMOTED_LOAD]] +; CHECK-NEXT: [[TMP0]] = select i1 [[CMP_I_I_I]], i32 [[L_1]], i32 [[L_2_PROMOTED_LOAD]] ; CHECK-NEXT: [[MIN_SELECT]] = select i1 [[CMP_I_I_I]], i32* [[PTR_IV]], i32* [[MIN_PTR]] ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i32, i32* [[PTR_IV]], i64 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32* [[PTR_IV_NEXT]], [[END:%.*]] ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[RES:%.*]] = load i32, i32* [[MIN_SELECT]], align 4 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 [[TMP0]] ; entry: br label %loop @@ -161,20 +164,21 @@ ; CHECK-LABEL: @test_pointer_phi_select_same_object_multiple_loads_2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_PTR:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 1 +; CHECK-NEXT: [[L_2_PROMOTED:%.*]] = load i32, i32* [[PTR]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[START_PTR]], [[ENTRY:%.*]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[L_2_PROMOTED_LOAD:%.*]] = phi i32 [ [[L_2_PROMOTED]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi i32* [ [[START_PTR]], [[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[MIN_PTR:%.*]] = phi i32* [ [[PTR]], [[ENTRY]] ], [ [[MIN_SELECT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[PTR_IV]], align 4 -; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[MIN_PTR]], align 4 -; CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ult i32 [[L_1]], [[L_2]] +; CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ult i32 [[L_1]], [[L_2_PROMOTED_LOAD]] +; CHECK-NEXT: [[TMP0]] = select i1 [[CMP_I_I_I]], i32 [[L_1]], i32 [[L_2_PROMOTED_LOAD]] ; CHECK-NEXT: [[MIN_SELECT]] = select i1 [[CMP_I_I_I]], i32* [[PTR_IV]], i32* [[MIN_PTR]] ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i32, i32* [[PTR_IV]], i64 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32* [[PTR_IV_NEXT]], [[END:%.*]] ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[RES:%.*]] = load i32, i32* [[MIN_SELECT]], align 4 -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 [[TMP0]] ; entry: %start.ptr = getelementptr inbounds i32, i32* %ptr, i64 1