Index: llvm/include/llvm/Transforms/Scalar/GVN.h =================================================================== --- llvm/include/llvm/Transforms/Scalar/GVN.h +++ llvm/include/llvm/Transforms/Scalar/GVN.h @@ -330,6 +330,11 @@ AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks); + /// Given a critical edge from Pred to LoadBB, find a load instruction + /// which is identical to Load from another successor of Pred. + LoadInst *findLoadToHoistIntoPred(BasicBlock *Pred, BasicBlock *LoadBB, + LoadInst *Load); + bool PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks); @@ -343,7 +348,8 @@ /// AvailableLoads (connected by Phis if needed). void eliminatePartiallyRedundantLoad( LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, - MapVector<BasicBlock *, Value *> &AvailableLoads); + MapVector<BasicBlock *, Value *> &AvailableLoads, + MapVector<BasicBlock *, LoadInst *> *CriticalEdgePredAndLoad); // Other helper routines bool processInstruction(Instruction *I); Index: llvm/lib/Transforms/Scalar/GVN.cpp =================================================================== --- llvm/lib/Transforms/Scalar/GVN.cpp +++ llvm/lib/Transforms/Scalar/GVN.cpp @@ -920,6 +920,19 @@ return !UnavailableBB; } +/// If the specified BB exists in ValuesPerBlock, replace its value with +/// NewValue. +static void ReplaceValuesPerBlockEntry( + SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock, BasicBlock *BB, + Value *NewValue) { + for (AvailableValueInBlock &V : ValuesPerBlock) { + if (V.BB == BB) { + V = AvailableValueInBlock::get(BB, NewValue); + return; + } + } +} + /// Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate Load. This returns the value /// that should be used at Load's definition site. @@ -1345,9 +1358,56 @@ +/// Given the following code, v1 is partially available on some edges, but not +/// available on the edge from PredBB. 
This function tries to find if there is +/// another identical load in the other successor of PredBB. +/// +/// v0 = load %addr +/// br %LoadBB +/// +/// LoadBB: +/// v1 = load %addr +/// ... +/// +/// PredBB: +/// ... +/// br %cond, label %LoadBB, label %SuccBB +/// +/// SuccBB: +/// v2 = load %addr +/// ... +/// +LoadInst *GVNPass::findLoadToHoistIntoPred(BasicBlock *Pred, BasicBlock *LoadBB, + LoadInst *Load) { + // For simplicity we only handle the case where Pred has two successors. + auto *Term = Pred->getTerminator(); + if (Term->getNumSuccessors() != 2) + return nullptr; + auto *SuccBB = Term->getSuccessor(0); + if (SuccBB == LoadBB) + SuccBB = Term->getSuccessor(1); + if (!SuccBB->getSinglePredecessor()) + return nullptr; + + for (Instruction &Inst : *SuccBB) { + if (Inst.isIdenticalTo(Load)) { + MemDepResult Dep = MD->getDependency(&Inst); + // If an identical load doesn't depend on any local instructions, it can + // be safely moved to PredBB. + if (Dep.isNonLocal()) + return static_cast<LoadInst *>(&Inst); + + return nullptr; + } + } + + return nullptr; +} + void GVNPass::eliminatePartiallyRedundantLoad( LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, - MapVector<BasicBlock *, Value *> &AvailableLoads) { + MapVector<BasicBlock *, Value *> &AvailableLoads, + MapVector<BasicBlock *, LoadInst *> *CriticalEdgePredAndLoad) { for (const auto &AvailableLoad : AvailableLoads) { BasicBlock *UnavailableBlock = AvailableLoad.first; Value *LoadPtr = AvailableLoad.second; @@ -1401,6 +1461,19 @@ AvailableValueInBlock::get(UnavailableBlock, NewLoad)); MD->invalidateCachedPointerInfo(LoadPtr); LLVM_DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n'); + + // For PredBB in CriticalEdgePredAndLoad we need to delete the already found + // load instruction, which is now redundant. 
+ if (CriticalEdgePredAndLoad) { + auto I = CriticalEdgePredAndLoad->find(UnavailableBlock); + if (I != CriticalEdgePredAndLoad->end()) { + LoadInst *OldLoad = I->second; + OldLoad->replaceAllUsesWith(NewLoad); + ReplaceValuesPerBlockEntry(ValuesPerBlock, OldLoad->getParent(), + NewLoad); + markInstructionForDeletion(OldLoad); + } + } } // Perform PHI construction. @@ -1487,7 +1560,12 @@ for (BasicBlock *UnavailableBB : UnavailableBlocks) FullyAvailableBlocks[UnavailableBB] = AvailabilityState::Unavailable; - SmallVector<BasicBlock *, 4> CriticalEdgePred; + // The edge from Pred to LoadBB is a critical edge that will be split. + SmallVector<BasicBlock *, 4> CriticalEdgePredSplit; + // The edge from Pred to LoadBB is a critical edge, and another successor of + // Pred contains a load that can be moved to Pred. This data structure maps + // the Pred to the movable load. + MapVector<BasicBlock *, LoadInst *> CriticalEdgePredAndLoad; for (BasicBlock *Pred : predecessors(LoadBB)) { // If any predecessor block is an EH pad that does not allow non-PHI // instructions before the terminator, we can't PRE the load. @@ -1527,39 +1605,53 @@ return false; } - CriticalEdgePred.push_back(Pred); + if (LoadInst *LI = findLoadToHoistIntoPred(Pred, LoadBB, Load)) { + CriticalEdgePredAndLoad[Pred] = LI; + } else + CriticalEdgePredSplit.push_back(Pred); } else { // Only add the predecessors that will not be split for now. PredLoads[Pred] = nullptr; } + + // Early check for an unprofitable PRE load. + unsigned NumInsertPreds = PredLoads.size() + CriticalEdgePredSplit.size(); + if (NumInsertPreds > 1) + return false; } // Decide whether PRE is profitable for this load. 
- unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size(); + unsigned NumInsertPreds = PredLoads.size() + CriticalEdgePredSplit.size(); + unsigned NumUnavailablePreds = NumInsertPreds + + CriticalEdgePredAndLoad.size(); assert(NumUnavailablePreds != 0 && "Fully available value should already be eliminated!"); - // If this load is unavailable in multiple predecessors, reject it. + // If we need to insert new load in multiple predecessors, reject it. // FIXME: If we could restructure the CFG, we could make a common pred with // all the preds that don't have an available Load and insert a new load into // that one block. - if (NumUnavailablePreds != 1) + if (NumInsertPreds > 1) return false; // Now we know where we will insert load. We must ensure that it is safe // to speculatively execute the load at that points. if (MustEnsureSafetyOfSpeculativeExecution) { - if (CriticalEdgePred.size()) + if (CriticalEdgePredSplit.size()) if (!isSafeToSpeculativelyExecute(Load, LoadBB->getFirstNonPHI(), AC, DT)) return false; for (auto &PL : PredLoads) if (!isSafeToSpeculativelyExecute(Load, PL.first->getTerminator(), AC, DT)) return false; + for (auto &CEP : CriticalEdgePredAndLoad) + if (!isSafeToSpeculativelyExecute(Load, CEP.first->getTerminator(), AC, + DT)) + return false; } // Split critical edges, and update the unavailable predecessors accordingly. - for (BasicBlock *OrigPred : CriticalEdgePred) { + for (BasicBlock *OrigPred : CriticalEdgePredSplit) { BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!"); PredLoads[NewPred] = nullptr; @@ -1567,6 +1659,9 @@ << LoadBB->getName() << '\n'); } + for (auto &CEP : CriticalEdgePredAndLoad) + PredLoads[CEP.first] = nullptr; + // Check if the load can safely be moved to all the unavailable predecessors. 
bool CanDoPRE = true; const DataLayout &DL = Load->getModule()->getDataLayout(); @@ -1623,7 +1718,7 @@ } // HINT: Don't revert the edge-splitting as following transformation may // also need to split these critical edges. - return !CriticalEdgePred.empty(); + return !CriticalEdgePredSplit.empty(); } // Okay, we can eliminate this load by inserting a reload in the predecessor @@ -1648,7 +1743,8 @@ VN.lookupOrAdd(I); } - eliminatePartiallyRedundantLoad(Load, ValuesPerBlock, PredLoads); + eliminatePartiallyRedundantLoad(Load, ValuesPerBlock, PredLoads, + &CriticalEdgePredAndLoad); ++NumPRELoad; return true; } @@ -1727,7 +1823,8 @@ AvailableLoads[Preheader] = LoadPtr; LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOOP LOAD: " << *Load << '\n'); - eliminatePartiallyRedundantLoad(Load, ValuesPerBlock, AvailableLoads); + eliminatePartiallyRedundantLoad(Load, ValuesPerBlock, AvailableLoads, + nullptr); ++NumPRELoopLoad; return true; } @@ -2699,7 +2796,6 @@ --BI; for (auto *I : InstrsToErase) { - assert(I->getParent() == BB && "Removing instruction from wrong block?"); LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n'); salvageKnowledge(I, AC); salvageDebugInfo(*I); Index: llvm/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll =================================================================== --- llvm/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll +++ llvm/test/Transforms/GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll @@ -57,7 +57,7 @@ ; CHECK-NEXT: br label %bb15 ; CHECK-LABEL: bb15: -; CHECK: %tmp17 = phi i8 [ %tmp8, %bb15split ], [ %tmp17.pre, %bb1.bb15_crit_edge ] +; CHECK: %tmp17 = phi i8 [ %tmp12.pre3, %bb15split ], [ %tmp17.pre, %bb1.bb15_crit_edge ] bb19: ; preds = %bb15 ret i1 %tmp18 Index: llvm/test/Transforms/GVN/PRE/pre-load.ll =================================================================== --- llvm/test/Transforms/GVN/PRE/pre-load.ll +++ llvm/test/Transforms/GVN/PRE/pre-load.ll @@ -687,18 +687,14 @@ ; CHECK-LABEL: @test15( ; 
CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[A:%.*]], 0 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]], label [[IF_THEN:%.*]] -; CHECK: entry.if.end_crit_edge: ; CHECK-NEXT: [[VV_PRE:%.*]] = load i32, i32* [[X:%.*]], align 4 -; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[UU:%.*]] = load i32, i32* [[X]], align 4 -; CHECK-NEXT: store i32 [[UU]], i32* [[R:%.*]], align 4 +; CHECK-NEXT: store i32 [[VV_PRE]], i32* [[R:%.*]], align 4 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[VV:%.*]] = phi i32 [ [[VV_PRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[UU]], [[IF_THEN]] ] ; CHECK-NEXT: call void @f() -; CHECK-NEXT: ret i32 [[VV]] +; CHECK-NEXT: ret i32 [[VV_PRE]] ; entry: @@ -728,18 +724,14 @@ ; CHECK-LABEL: @test16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[A:%.*]], 0 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]], label [[IF_THEN:%.*]] -; CHECK: entry.if.end_crit_edge: ; CHECK-NEXT: [[VV_PRE:%.*]] = load i32, i32* [[X:%.*]], align 4 -; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[UU:%.*]] = load i32, i32* [[X]], align 4 -; CHECK-NEXT: store i32 [[UU]], i32* [[R:%.*]], align 4 +; CHECK-NEXT: store i32 [[VV_PRE]], i32* [[R:%.*]], align 4 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[VV:%.*]] = phi i32 [ [[VV_PRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[UU]], [[IF_THEN]] ] ; CHECK-NEXT: call void @f() -; CHECK-NEXT: ret i32 [[VV]] +; CHECK-NEXT: ret i32 [[VV_PRE]] ; entry: @@ -765,3 +757,81 @@ %vv = load i32, i32* %x, align 4 ret i32 %vv } + +declare i1 @foo() +declare i1 @bar() + +; %v3 is partially redundant, bb3 has multiple predecessors coming through +; critical edges. The other successors of those predecessors have same loads. 
+; We can move all loads into predecessors. + +define void @test17(i64* %p1, i64* %p2, i64* %p3, i64* %p4) +; CHECK-LABEL: @test17( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V1:%.*]] = load i64, i64* [[P1:%.*]], align 8 +; CHECK-NEXT: [[COND1:%.*]] = icmp sgt i64 [[V1]], 200 +; CHECK-NEXT: br i1 [[COND1]], label [[BB200:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[COND2:%.*]] = icmp sgt i64 [[V1]], 100 +; CHECK-NEXT: br i1 [[COND2]], label [[BB100:%.*]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[V2:%.*]] = add nsw i64 [[V1]], 1 +; CHECK-NEXT: store i64 [[V2]], i64* [[P1]], align 8 +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb3: +; CHECK-NEXT: [[V3:%.*]] = phi i64 [ [[V3_PRE:%.*]], [[BB200]] ], [ [[V3_PRE1:%.*]], [[BB100]] ], [ [[V2]], [[BB2]] ] +; CHECK-NEXT: store i64 [[V3]], i64* [[P2:%.*]], align 8 +; CHECK-NEXT: ret void +; CHECK: bb100: +; CHECK-NEXT: [[COND3:%.*]] = call i1 @foo() +; CHECK-NEXT: [[V3_PRE1]] = load i64, i64* [[P1]], align 8 +; CHECK-NEXT: br i1 [[COND3]], label [[BB3]], label [[BB101:%.*]] +; CHECK: bb101: +; CHECK-NEXT: store i64 [[V3_PRE1]], i64* [[P3:%.*]], align 8 +; CHECK-NEXT: ret void +; CHECK: bb200: +; CHECK-NEXT: [[COND4:%.*]] = call i1 @bar() +; CHECK-NEXT: [[V3_PRE]] = load i64, i64* [[P1]], align 8 +; CHECK-NEXT: br i1 [[COND4]], label [[BB3]], label [[BB201:%.*]] +; CHECK: bb201: +; CHECK-NEXT: store i64 [[V3_PRE]], i64* [[P4:%.*]], align 8 +; CHECK-NEXT: ret void +; +{ +entry: + %v1 = load i64, i64* %p1, align 8 + %cond1 = icmp sgt i64 %v1, 200 + br i1 %cond1, label %bb200, label %bb1 + +bb1: + %cond2 = icmp sgt i64 %v1, 100 + br i1 %cond2, label %bb100, label %bb2 + +bb2: + %v2 = add nsw i64 %v1, 1 + store i64 %v2, i64* %p1, align 8 + br label %bb3 + +bb3: + %v3 = load i64, i64* %p1, align 8 + store i64 %v3, i64* %p2, align 8 + ret void + +bb100: + %cond3 = call i1 @foo() + br i1 %cond3, label %bb3, label %bb101 + +bb101: + %v4 = load i64, i64* %p1, align 8 + store i64 %v4, i64* %p3, align 8 + ret void + 
+bb200: + %cond4 = call i1 @bar() + br i1 %cond4, label %bb3, label %bb201 + +bb201: + %v5 = load i64, i64* %p1, align 8 + store i64 %v5, i64* %p4, align 8 + ret void +} Index: llvm/test/Transforms/GVN/PRE/volatile.ll =================================================================== --- llvm/test/Transforms/GVN/PRE/volatile.ll +++ llvm/test/Transforms/GVN/PRE/volatile.ll @@ -122,18 +122,14 @@ define i32 @test7(i1 %c, i32* noalias nocapture %p, i32* noalias nocapture %q) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[C:%.*]], label [[ENTRY_HEADER_CRIT_EDGE:%.*]], label [[SKIP:%.*]] -; CHECK: entry.header_crit_edge: ; CHECK-NEXT: [[Y_PRE:%.*]] = load i32, i32* [[P:%.*]], align 4 -; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[HEADER:%.*]], label [[SKIP:%.*]] ; CHECK: skip: -; CHECK-NEXT: [[Y1:%.*]] = load i32, i32* [[P]], align 4 -; CHECK-NEXT: call void @use(i32 [[Y1]]) +; CHECK-NEXT: call void @use(i32 [[Y_PRE]]) ; CHECK-NEXT: br label [[HEADER]] ; CHECK: header: -; CHECK-NEXT: [[Y:%.*]] = phi i32 [ [[Y_PRE]], [[ENTRY_HEADER_CRIT_EDGE]] ], [ [[Y]], [[HEADER]] ], [ [[Y1]], [[SKIP]] ] ; CHECK-NEXT: [[X:%.*]] = load volatile i32, i32* [[Q:%.*]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[Y]], [[X]] +; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[Y_PRE]], [[X]] ; CHECK-NEXT: [[CND:%.*]] = icmp eq i32 [[ADD]], 0 ; CHECK-NEXT: br i1 [[CND]], label [[EXIT:%.*]], label [[HEADER]] ; CHECK: exit: Index: llvm/test/Transforms/GVN/condprop.ll =================================================================== --- llvm/test/Transforms/GVN/condprop.ll +++ llvm/test/Transforms/GVN/condprop.ll @@ -521,19 +521,15 @@ ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[PTR2:%.*]], i32 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[PTR2]], i32 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR1:%.*]], [[PTR2]] -; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[ENTRY_END_CRIT_EDGE:%.*]] -; CHECK: entry.end_crit_edge: ; 
CHECK-NEXT: [[VAL2_PRE:%.*]] = load i32, i32* [[GEP2]], align 4 -; CHECK-NEXT: br label [[END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[END:%.*]] ; CHECK: if: -; CHECK-NEXT: [[VAL1:%.*]] = load i32, i32* [[GEP2]], align 4 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[VAL2:%.*]] = phi i32 [ [[VAL1]], [[IF]] ], [ [[VAL2_PRE]], [[ENTRY_END_CRIT_EDGE]] ] -; CHECK-NEXT: [[PHI1:%.*]] = phi i32* [ [[PTR2]], [[IF]] ], [ [[GEP1]], [[ENTRY_END_CRIT_EDGE]] ] -; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[VAL1]], [[IF]] ], [ 0, [[ENTRY_END_CRIT_EDGE]] ] +; CHECK-NEXT: [[PHI1:%.*]] = phi i32* [ [[PTR2]], [[IF]] ], [ [[GEP1]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[VAL2_PRE]], [[IF]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: store i32 0, i32* [[PHI1]], align 4 -; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI2]], [[VAL2]] +; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI2]], [[VAL2_PRE]] ; CHECK-NEXT: ret i32 [[RET]] ; entry: