Index: llvm/lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- llvm/lib/CodeGen/CodeGenPrepare.cpp +++ llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -259,6 +259,10 @@ OptimizePhiTypes("cgp-optimize-phi-types", cl::Hidden, cl::init(false), cl::desc("Enable converting phi types in CodeGenPrepare")); +static cl::opt + HugeFuncThresholdInCGPP("cgpp-huge-func", cl::init(0), cl::Hidden, + cl::desc("Least BB number of huge function.")); + namespace { enum ExtType { @@ -270,6 +274,15 @@ // information of a promoted instruction invalid. }; +enum ModifyDT { + NotModifyDT, // Not Modify any DT. + ModifyBBDT, // Modify the Basic Block Dominator Tree. + ModifyInstDT // Modify the Instruction Dominator in a Basic Block, + // This usually means we move/delete/insert instruction + // in a Basic Block. So we should re-iterate instructions + // in such Basic Block. +}; + using SetOfInstrs = SmallPtrSet; using TypeIsSExt = PointerIntPair; using InstrToOrigTy = DenseMap; @@ -342,6 +355,13 @@ std::unique_ptr DT; public: + /// If encounter huge function, we need to limit the build time. + bool IsHugeFunc = false; + + /// FreshBBs is like worklist, it collected the updated BBs which need + /// to be optimized again. 
+ SmallSet FreshBBs; + static char ID; // Pass identification, replacement for typeid CodeGenPrepare() : FunctionPass(ID) { @@ -398,13 +418,13 @@ bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB, bool isPreheader); bool makeBitReverse(Instruction &I); - bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT); - bool optimizeInst(Instruction *I, bool &ModifiedDT); + bool optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT); + bool optimizeInst(Instruction *I, ModifyDT &ModifiedDT); bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy, unsigned AddrSpace); bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr); bool optimizeInlineAsmInst(CallInst *CS); - bool optimizeCallInst(CallInst *CI, bool &ModifiedDT); + bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT); bool optimizeExt(Instruction *&I); bool optimizeExtUses(Instruction *I); bool optimizeLoadExt(LoadInst *Load); @@ -416,7 +436,7 @@ bool optimizeSwitchPhiConstants(SwitchInst *SI); bool optimizeSwitchInst(SwitchInst *SI); bool optimizeExtractElementInst(Instruction *Inst); - bool dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT); + bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT); bool fixupDbgValue(Instruction *I); bool placeDbgValues(Function &F); bool placePseudoProbes(Function &F); @@ -435,15 +455,15 @@ Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, bool HasPromoted, TypePromotionTransaction &TPT, SmallVectorImpl &SpeculativelyMovedExts); - bool splitBranchCondition(Function &F, bool &ModifiedDT); + bool splitBranchCondition(Function &F, ModifyDT &ModifiedDT); bool simplifyOffsetableRelocate(GCStatepointInst &I); bool tryToSinkFreeOperands(Instruction *I); bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1, CmpInst *Cmp, Intrinsic::ID IID); - bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT); - bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT); - bool 
combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT); + bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT); + bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT); + bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT); void verifyBFIUpdates(Function &F); }; @@ -474,6 +494,7 @@ // Clear per function information. InsertedInsts.clear(); PromotedInsts.clear(); + FreshBBs.clear(); TM = &getAnalysis().getTM(); SubtargetInfo = TM->getSubtargetImpl(F); @@ -537,7 +558,7 @@ // unconditional branch. EverMadeChange |= eliminateMostlyEmptyBlocks(F); - bool ModifiedDT = false; + ModifyDT ModifiedDT = ModifyDT::NotModifyDT; if (!DisableBranchOpts) EverMadeChange |= splitBranchCondition(F, ModifiedDT); @@ -546,18 +567,51 @@ EverMadeChange |= SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/true); + // If we are optimizing a huge function, we need to consider the build time. + // Because the basic algorithm's complexity is near O(N!). + IsHugeFunc = F.size() > HugeFuncThresholdInCGPP; + bool MadeChange = true; + bool FuncIterated = false; while (MadeChange) { MadeChange = false; DT.reset(); + for (BasicBlock &BB : llvm::make_early_inc_range(F)) { - bool ModifiedDTOnIteration = false; - MadeChange |= optimizeBlock(BB, ModifiedDTOnIteration); + if (FuncIterated && !FreshBBs.contains(&BB)) continue; - // Restart BB iteration if the dominator tree of the Function was changed - if (ModifiedDTOnIteration) break; + ModifyDT ModifiedDTOnIteration = ModifyDT::NotModifyDT; + bool Changed = optimizeBlock(BB, ModifiedDTOnIteration); + + MadeChange |= Changed; + if (IsHugeFunc) { + // If the BB is updated, it may still have a chance to be optimized. + // This usually happens at sink optimization. + // For example: + // + // bb0: + // %and = and i32 %a, 4 + // %cmp = icmp eq i32 %and, 0 + // + // If the %cmp sinks to another BB, the %and will have a chance to sink. 
+ if (Changed) + FreshBBs.insert(&BB); + else if (FuncIterated) + FreshBBs.erase(&BB); + + if (ModifiedDTOnIteration == ModifyDT::ModifyBBDT) + DT.reset(); + } else { + // For small/normal functions, we restart BB iteration if the dominator + // tree of the Function was changed. + if (ModifiedDTOnIteration != ModifyDT::NotModifyDT) + break; + } } + // We have iterated all the BB in the (only work for huge) function. + FuncIterated = IsHugeFunc; + if (EnableTypePromotionMerge && !ValToSExtendedUses.empty()) MadeChange |= mergeSExts(F); if (!LargeOffsetGEPMap.empty()) @@ -728,6 +782,12 @@ // Merge BB into SinglePred and delete it. MergeBlockIntoPredecessor(BB); Preds.insert(SinglePred); + + if (IsHugeFunc) { + // Update FreshBBs to optimize the merged BB. + FreshBBs.insert(SinglePred); + FreshBBs.erase(BB); + } } } @@ -962,6 +1022,22 @@ return true; } +/// Replace all old uses with new ones, and push the updated BBs into FreshBBs. +static void replaceAllUsesWith(Value *Old, Value *New, + SmallSet &FreshBBs, + bool IsHuge) { + auto *OldI = dyn_cast(Old); + if (OldI) { + for (Value::user_iterator UI = OldI->user_begin(), E = OldI->user_end(); + UI != E; ++UI) { + Instruction *User = cast(*UI); + if (IsHuge) + FreshBBs.insert(User->getParent()); + } + } + Old->replaceAllUsesWith(New); +} + /// Eliminate a basic block that has only phi's and an unconditional branch in /// it. void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) { @@ -982,6 +1058,12 @@ // Note: BB(=SinglePred) will not be deleted on this path. // DestBB(=its single successor) is the one that was deleted. LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n"); + + if (IsHugeFunc) { + // Update FreshBBs to optimize the merged BB. 
+ FreshBBs.insert(SinglePred); + FreshBBs.erase(DestBB); + } return; } } @@ -1449,12 +1531,12 @@ Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1); if (BO->getOpcode() != Instruction::Xor) { Value *Math = Builder.CreateExtractValue(MathOV, 0, "math"); - BO->replaceAllUsesWith(Math); + replaceAllUsesWith(BO, Math, FreshBBs, IsHugeFunc); } else assert(BO->hasOneUse() && "Patterns with XOr should use the BO only in the compare"); Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov"); - Cmp->replaceAllUsesWith(OV); + replaceAllUsesWith(Cmp, OV, FreshBBs, IsHugeFunc); Cmp->eraseFromParent(); BO->eraseFromParent(); return true; @@ -1492,7 +1574,8 @@ /// Try to combine the compare into a call to the llvm.uadd.with.overflow /// intrinsic. Return true if any changes were made. -bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT) { +bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, + ModifyDT &ModifiedDT) { Value *A, *B; BinaryOperator *Add; if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) { @@ -1519,11 +1602,12 @@ return false; // Reset callers - do not crash by iterating over a dead instruction. - ModifiedDT = true; + ModifiedDT = ModifyDT::ModifyInstDT; return true; } -bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT) { +bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, + ModifyDT &ModifiedDT) { // We are not expecting non-canonical/degenerate code. Just bail out. Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1); if (isa(A) && isa(B)) @@ -1581,7 +1665,7 @@ return false; // Reset callers - do not crash by iterating over a dead instruction. 
- ModifiedDT = true; + ModifiedDT = ModifyDT::ModifyInstDT; return true; } @@ -1738,7 +1822,7 @@ return true; } -bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, bool &ModifiedDT) { +bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (sinkCmpExpression(Cmp, *TLI)) return true; @@ -2043,7 +2127,9 @@ /// If the transform is performed, return true and set ModifiedDT to true. static bool despeculateCountZeros(IntrinsicInst *CountZeros, const TargetLowering *TLI, - const DataLayout *DL, bool &ModifiedDT) { + const DataLayout *DL, ModifyDT &ModifiedDT, + SmallSet &FreshBBs, + bool IsHugeFunc) { // If a zero input is undefined, it doesn't make sense to despeculate that. if (match(CountZeros->getOperand(1), m_One())) return false; @@ -2068,12 +2154,16 @@ // The intrinsic will be sunk behind a compare against zero and branch. BasicBlock *StartBlock = CountZeros->getParent(); BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false"); + if (IsHugeFunc) + FreshBBs.insert(CallBlock); // Create another block after the count zero intrinsic. A PHI will be added // in this block to select the result of the intrinsic or the bit-width // constant if the input to the intrinsic is zero. BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros)); BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end"); + if (IsHugeFunc) + FreshBBs.insert(EndBlock); // Set up a builder to create a compare, conditional branch, and PHI. IRBuilder<> Builder(CountZeros->getContext()); @@ -2094,7 +2184,7 @@ // or the bit width of the operand. Builder.SetInsertPoint(&EndBlock->front()); PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz"); - CountZeros->replaceAllUsesWith(PN); + replaceAllUsesWith(CountZeros, PN, FreshBBs, IsHugeFunc); Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits)); PN->addIncoming(BitWidth, StartBlock); PN->addIncoming(CountZeros, CallBlock); @@ -2103,11 +2193,11 @@ // undefined zero argument to 'true'. 
This will also prevent reprocessing the // intrinsic; we only despeculate when a zero input is defined. CountZeros->setArgOperand(1, Builder.getTrue()); - ModifiedDT = true; + ModifiedDT = ModifyDT::ModifyBBDT; return true; } -bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { +bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { BasicBlock *BB = CI->getParent(); // Lower inline assembly if we can. @@ -2241,14 +2331,15 @@ LargeOffsetGEPMap.erase(II); } - II->replaceAllUsesWith(ArgVal); + replaceAllUsesWith(II, ArgVal, FreshBBs, IsHugeFunc); II->eraseFromParent(); return true; } case Intrinsic::cttz: case Intrinsic::ctlz: // If counting zeros is expensive, try to avoid it. - return despeculateCountZeros(II, TLI, DL, ModifiedDT); + return despeculateCountZeros(II, TLI, DL, ModifiedDT, FreshBBs, + IsHugeFunc); case Intrinsic::fshl: case Intrinsic::fshr: return optimizeFunnelShift(II); @@ -2265,7 +2356,8 @@ auto *One = ConstantInt::getSigned(II->getType(), 1); auto *CGep = ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One); - II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType())); + replaceAllUsesWith(II, ConstantExpr::getPtrToInt(CGep, II->getType()), + FreshBBs, IsHugeFunc); II->eraseFromParent(); return true; } @@ -2299,7 +2391,7 @@ FortifiedLibCallSimplifier Simplifier(TLInfo, true); IRBuilder<> Builder(CI); if (Value *V = Simplifier.optimizeCall(CI, Builder)) { - CI->replaceAllUsesWith(V); + replaceAllUsesWith(CI, V, FreshBBs, IsHugeFunc); CI->eraseFromParent(); return true; } @@ -2338,7 +2430,10 @@ /// ret i32 %tmp2 /// @endcode bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, - bool &ModifiedDT) { + ModifyDT &ModifiedDT) { + if (!BB->getTerminator()) + return false; + ReturnInst *RetI = dyn_cast(BB->getTerminator()); if (!RetI) return false; @@ -2432,7 +2527,8 @@ BFI->setBlockFreq( BB, (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency()); - ModifiedDT = Changed 
= true; + ModifiedDT = ModifyDT::ModifyBBDT; + Changed = true; ++NumRetsDup; } @@ -5834,7 +5930,7 @@ bool inserted = false; for (auto &Pt : CurPts) { if (getDT(F).dominates(Inst, Pt)) { - Pt->replaceAllUsesWith(Inst); + replaceAllUsesWith(Pt, Inst, FreshBBs, IsHugeFunc); RemovedInsts.insert(Pt); Pt->removeFromParent(); Pt = Inst; @@ -5846,7 +5942,7 @@ // Give up if we need to merge in a common dominator as the // experiments show it is not profitable. continue; - Inst->replaceAllUsesWith(Pt); + replaceAllUsesWith(Inst, Pt, FreshBBs, IsHugeFunc); RemovedInsts.insert(Inst); Inst->removeFromParent(); inserted = true; @@ -5998,7 +6094,7 @@ if (GEP->getType() != I8PtrTy) NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType()); } - GEP->replaceAllUsesWith(NewGEP); + replaceAllUsesWith(GEP, NewGEP, FreshBBs, IsHugeFunc); LargeOffsetGEPID.erase(GEP); LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP); GEP->eraseFromParent(); @@ -6135,7 +6231,7 @@ for (Instruction *U : Uses) { if (isa(U)) { DeletedInstrs.insert(U); - U->replaceAllUsesWith(ValMap[U->getOperand(0)]); + replaceAllUsesWith(U, ValMap[U->getOperand(0)], FreshBBs, IsHugeFunc); } else { U->setOperand(0, new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U)); @@ -6163,7 +6259,7 @@ // Remove any old phi's that have been converted. for (auto *I : DeletedInstrs) { - I->replaceAllUsesWith(PoisonValue::get(I->getType())); + replaceAllUsesWith(I, PoisonValue::get(I->getType()), FreshBBs, IsHugeFunc); I->eraseFromParent(); } @@ -6578,7 +6674,7 @@ // Replace all uses of load with new and (except for the use of load in the // new and itself). - Load->replaceAllUsesWith(NewAnd); + replaceAllUsesWith(Load, NewAnd, FreshBBs, IsHugeFunc); NewAnd->setOperand(0, Load); // Remove any and instructions that are now redundant. @@ -6586,7 +6682,7 @@ // Check that the and mask is the same as the one we decided to put on the // new and. 
if (cast(And->getOperand(1))->getValue() == DemandBits) { - And->replaceAllUsesWith(NewAnd); + replaceAllUsesWith(And, NewAnd, FreshBBs, IsHugeFunc); if (&*CurInstIterator == And) CurInstIterator = std::next(And->getIterator()); And->eraseFromParent(); @@ -6697,7 +6793,7 @@ Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal); Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal); Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal); - Shift->replaceAllUsesWith(NewSel); + replaceAllUsesWith(Shift, NewSel, FreshBBs, IsHugeFunc); Shift->eraseFromParent(); return true; } @@ -6732,7 +6828,7 @@ Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, {X, Y, TVal}); Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, {X, Y, FVal}); Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal); - Fsh->replaceAllUsesWith(NewSel); + replaceAllUsesWith(Fsh, NewSel, FreshBBs, IsHugeFunc); Fsh->eraseFromParent(); return true; } @@ -6819,6 +6915,8 @@ BasicBlock *StartBlock = SI->getParent(); BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); + if (IsHugeFunc) + FreshBBs.insert(EndBlock); BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency()); // Delete the unconditional branch that was just created by the split. 
@@ -6839,6 +6937,8 @@ TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink", EndBlock->getParent(), EndBlock); TrueBranch = BranchInst::Create(EndBlock, TrueBlock); + if (IsHugeFunc) + FreshBBs.insert(TrueBlock); TrueBranch->setDebugLoc(SI->getDebugLoc()); } auto *TrueInst = cast(SI->getTrueValue()); @@ -6848,6 +6948,8 @@ if (FalseBlock == nullptr) { FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink", EndBlock->getParent(), EndBlock); + if (IsHugeFunc) + FreshBBs.insert(FalseBlock); FalseBranch = BranchInst::Create(EndBlock, FalseBlock); FalseBranch->setDebugLoc(SI->getDebugLoc()); } @@ -6864,6 +6966,8 @@ FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", EndBlock->getParent(), EndBlock); + if (IsHugeFunc) + FreshBBs.insert(FalseBlock); auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); FalseBranch->setDebugLoc(SI->getDebugLoc()); } @@ -6903,7 +7007,7 @@ PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock); PN->setDebugLoc(SI->getDebugLoc()); - SI->replaceAllUsesWith(PN); + replaceAllUsesWith(SI, PN, FreshBBs, IsHugeFunc); SI->eraseFromParent(); INS.erase(SI); ++NumSelectsExpanded; @@ -6941,7 +7045,7 @@ Value *Shuffle = Builder.CreateVectorSplat(NewVecType->getNumElements(), BC1); Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType); - SVI->replaceAllUsesWith(BC2); + replaceAllUsesWith(SVI, BC2, FreshBBs, IsHugeFunc); RecursivelyDeleteTriviallyDeadInstructions( SVI, TLInfo, nullptr, [&](Value *V) { removeAllAssertingVHReferences(V); }); @@ -6994,6 +7098,18 @@ for (Use *U : ToReplace) { auto *UI = cast(U->get()); Instruction *NI = UI->clone(); + + if (IsHugeFunc) { + // Now we clone an instruction, its operands' defs may sink to this BB + // now. So we put the operands defs' BBs into FreshBBs to do optimization. 
+ for (unsigned I = 0; I < NI->getNumOperands(); ++I) { + auto *OpDef = dyn_cast(NI->getOperand(I)); + if (!OpDef) + continue; + FreshBBs.insert(OpDef->getParent()); + } + } + NewInstructions[UI] = NI; MaybeDead.insert(UI); LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n"); @@ -7833,7 +7949,9 @@ return true; } -static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI) { +static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI, + SmallSet &FreshBBs, + bool IsHugeFunc) { // Try and convert // %c = icmp ult %x, 8 // br %c, bla, blb @@ -7874,7 +7992,7 @@ ConstantInt::get(UI->getType(), 0)); LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n"); LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n"); - Cmp->replaceAllUsesWith(NewCmp); + replaceAllUsesWith(Cmp, NewCmp, FreshBBs, IsHugeFunc); return true; } if (Cmp->isEquality() && @@ -7887,14 +8005,14 @@ ConstantInt::get(UI->getType(), 0)); LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n"); LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n"); - Cmp->replaceAllUsesWith(NewCmp); + replaceAllUsesWith(Cmp, NewCmp, FreshBBs, IsHugeFunc); return true; } } return false; } -bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { +bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) { // Bail out if we inserted the instruction to prevent optimizations from // stepping on each other's toes. if (InsertedInsts.count(I)) @@ -7907,7 +8025,7 @@ // trivial PHI, go ahead and zap it here. 
if (Value *V = simplifyInstruction(P, {*DL, TLInfo})) { LargeOffsetGEPMap.erase(P); - P->replaceAllUsesWith(V); + replaceAllUsesWith(P, V, FreshBBs, IsHugeFunc); P->eraseFromParent(); ++NumPHIsElim; return true; @@ -7996,7 +8114,7 @@ Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), GEPI->getName(), GEPI); NC->setDebugLoc(GEPI->getDebugLoc()); - GEPI->replaceAllUsesWith(NC); + replaceAllUsesWith(GEPI, NC, FreshBBs, IsHugeFunc); GEPI->eraseFromParent(); ++NumGEPsElim; optimizeInst(NC, ModifiedDT); @@ -8029,7 +8147,7 @@ F->takeName(FI); CmpI->setOperand(Const0 ? 1 : 0, F); } - FI->replaceAllUsesWith(CmpI); + replaceAllUsesWith(FI, CmpI, FreshBBs, IsHugeFunc); FI->eraseFromParent(); return true; } @@ -8056,7 +8174,7 @@ case Instruction::ExtractElement: return optimizeExtractElementInst(cast(I)); case Instruction::Br: - return optimizeBranch(cast(I), *TLI); + return optimizeBranch(cast(I), *TLI, FreshBBs, IsHugeFunc); } return false; @@ -8074,7 +8192,7 @@ if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts)) return false; Instruction *LastInst = Insts.back(); - I.replaceAllUsesWith(LastInst); + replaceAllUsesWith(&I, LastInst, FreshBBs, IsHugeFunc); RecursivelyDeleteTriviallyDeadInstructions( &I, TLInfo, nullptr, [&](Value *V) { removeAllAssertingVHReferences(V); }); @@ -8084,16 +8202,28 @@ // In this pass we look for GEP and cast instructions that are used // across basic blocks and rewrite them to improve basic-block-at-a-time // selection. 
-bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { +bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT) { SunkAddrs.clear(); bool MadeChange = false; - CurInstIterator = BB.begin(); - while (CurInstIterator != BB.end()) { - MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT); - if (ModifiedDT) - return true; - } + do { + CurInstIterator = BB.begin(); + ModifiedDT = ModifyDT::NotModifyDT; + while (CurInstIterator != BB.end()) { + MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT); + if (ModifiedDT != ModifyDT::NotModifyDT) { + // For huge functions we tend to quickly go through the inner optimization + // opportunities in the BB. So we go back to the BB head to re-optimize + // each instruction instead of going back to the function head. + if (IsHugeFunc) { + // DT.reset(); + break; + } else { + return true; + } + } + } + } while (ModifiedDT == ModifyDT::ModifyInstDT); bool MadeBitReverse = true; while (MadeBitReverse) { @@ -8253,7 +8383,7 @@ /// /// FIXME: Remove the (equivalent?) implementation in SelectionDAG. /// -bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) { +bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) { if (!TM->Options.EnableFastISel || TLI->isJumpExpensive()) return false; @@ -8304,6 +8434,8 @@ auto *TmpBB = BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split", BB.getParent(), BB.getNextNode()); + if (IsHugeFunc) + FreshBBs.insert(TmpBB); // Update original basic block by using the first condition directly by the // branch instruction and removing the no longer needed and/or instruction. 
@@ -8420,7 +8552,7 @@ } } - ModifiedDT = true; + ModifiedDT = ModifyDT::ModifyBBDT; MadeChange = true; LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump(); Index: llvm/test/CodeGen/AArch64/and-sink.ll =================================================================== --- llvm/test/CodeGen/AArch64/and-sink.ll +++ llvm/test/CodeGen/AArch64/and-sink.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck --check-prefix=CHECK-CGP %s +; RUN: opt -S -codegenprepare -cgpp-huge-func=0 -mtriple=aarch64-linux %s | FileCheck --check-prefix=CHECK-CGP %s @A = dso_local global i32 zeroinitializer @B = dso_local global i32 zeroinitializer Index: llvm/test/CodeGen/X86/and-sink.ll =================================================================== --- llvm/test/CodeGen/X86/and-sink.ll +++ llvm/test/CodeGen/X86/and-sink.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=i686-unknown -verify-machineinstrs < %s | FileCheck %s ; RUN: opt < %s -codegenprepare -S -mtriple=x86_64-unknown-unknown | FileCheck --check-prefix=CHECK-CGP %s +; RUN: opt < %s -codegenprepare -cgpp-huge-func=0 -S -mtriple=x86_64-unknown-unknown | FileCheck --check-prefix=CHECK-CGP %s @A = global i32 zeroinitializer @B = global i32 zeroinitializer Index: llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll =================================================================== --- llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll +++ llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain-inseltpoison.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp < %s -codegenprepare -S | FileCheck -check-prefix=CHECK %s +; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp < %s -codegenprepare 
-cgpp-huge-func=0 -S | FileCheck -check-prefix=CHECK %s ; Sink the shufflevector/insertelement pair, followed by the trunc. The sunk instruction end up dead. define signext i8 @dead(i16* noalias nocapture readonly %s1, i16 zeroext %x, i8* noalias nocapture %d, i32 %n) { Index: llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain.ll =================================================================== --- llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain.ll +++ llvm/test/Transforms/CodeGenPrepare/ARM/sinkchain.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp < %s -codegenprepare -S | FileCheck -check-prefix=CHECK %s +; RUN: opt -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp < %s -codegenprepare -cgpp-huge-func=0 -S | FileCheck -check-prefix=CHECK %s ; Sink the shufflevector/insertelement pair, followed by the trunc. The sunk instruction end up dead. define signext i8 @dead(i16* noalias nocapture readonly %s1, i16 zeroext %x, i8* noalias nocapture %d, i32 %n) { Index: llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll =================================================================== --- llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll +++ llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -codegenprepare < %s | FileCheck %s +; RUN: opt -S -codegenprepare -cgpp-huge-func=0 < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"