Index: lib/Transforms/IPO/GlobalOpt.cpp =================================================================== --- lib/Transforms/IPO/GlobalOpt.cpp +++ lib/Transforms/IPO/GlobalOpt.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -69,6 +70,7 @@ struct GlobalOpt : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); } static char ID; // Pass identification, replacement for typeid GlobalOpt() : ModulePass(ID) { @@ -85,7 +87,9 @@ bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI, const GlobalStatus &GS); bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); - + + bool isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV); + TargetLibraryInfo *TLI; SmallSet NotDiscardableComdats; }; @@ -95,6 +99,7 @@ INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) @@ -1718,15 +1723,78 @@ return ProcessInternalGlobal(GV, GVI, GS); } +bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV) { + // Find all uses of GV. We expect them all to be in F, and if we can't + // identify any of the uses we bail out. + // + // On each of these uses, identify if the memory that GV points to is + // used/required/live at the start of the function. If it is not, for example + // if the first thing the function does is store to the GV, the GV can + // possibly be demoted. + // + // We don't do an exhaustive search for memory operations - simply look + // through bitcasts as they're quite common and benign. + const DataLayout &DL = GV->getParent()->getDataLayout(); + SmallVector Loads; + SmallVector Stores; + for (auto *U : GV->users()) { + if (Operator::getOpcode(U) == Instruction::BitCast) { + for (auto *UU : U->users()) { + if (auto *LI = dyn_cast(UU)) + Loads.push_back(LI); + else if (auto *SI = dyn_cast(UU)) + Stores.push_back(SI); + else + return false; + } + continue; + } + + Instruction *I = dyn_cast(U); + if (!I) + return false; + assert(I->getParent()->getParent() == F); + + if (auto *LI = dyn_cast(I)) + Loads.push_back(LI); + else if (auto *SI = dyn_cast(I)) + Stores.push_back(SI); + else + return false; + } + + // We have identified all uses of GV into loads and stores. Now check if all + // of them are known not to depend on the value of the global at the function + // entry point. We do this by ensuring that every load is dominated by at + // least one store. + auto &DT = getAnalysis(*const_cast(F)) + .getDomTree(); + + for (auto *L : Loads) { + auto *LTy = L->getType(); + if (!std::any_of(Stores.begin(), Stores.end(), [&](StoreInst *S) { + auto *STy = S->getValueOperand()->getType(); + // The load is only dominated by the store if DomTree says so + // and the number of bits loaded in L is less than or equal to + // the number of bits stored in S. + return DT.dominates(S, L) && + DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy); + })) + return false; + } + // All loads have known dependences inside F, so the global can be localized. + return true; +} + /// Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, Module::global_iterator &GVI, const GlobalStatus &GS) { auto &DL = GV->getParent()->getDataLayout(); - // If this is a first class global and has only one accessing function - // and this function is main (which we know is not recursive), we replace - // the global with a local alloca in this function. + // If this is a first class global and has only one accessing function and + // this function is non-recursive, we replace the global with a local alloca + // in this function. // // NOTE: It doesn't make sense to promote non-single-value types since we // are just replacing static memory to stack memory. @@ -1735,9 +1803,10 @@ if (!GS.HasMultipleAccessingFunctions && GS.AccessingFunction && !GS.HasNonInstructionUser && GV->getType()->getElementType()->isSingleValueType() && - GS.AccessingFunction->getName() == "main" && - GS.AccessingFunction->hasExternalLinkage() && - GV->getType()->getAddressSpace() == 0) { + GV->getType()->getAddressSpace() == 0 && + !GV->isExternallyInitialized() && + GS.AccessingFunction->doesNotRecurse() && + isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV) ) { DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n"); Instruction &FirstI = const_cast(*GS.AccessingFunction ->getEntryBlock().begin()); Index: test/Transforms/GlobalOpt/global-demotion.ll =================================================================== --- /dev/null +++ test/Transforms/GlobalOpt/global-demotion.ll @@ -0,0 +1,80 @@ +; RUN: opt -globalopt -S < %s | FileCheck %s + +@G1 = internal global i32 5 +@G2 = internal global i32 5 +@G3 = internal global i32 5 +@G4 = internal global i32 5 +@G5 = internal global i32 5 + +; CHECK-LABEL: @test1 +define internal i32 @test1() norecurse { +; CHECK-NOT: @G1 + store i32 4, i32* @G1 + %a = load i32, i32* @G1 +; CHECK: ret + ret i32 %a +} + +; The load comes before the store which makes @G2 live before the call. +; CHECK-LABEL: @test2 +define internal i32 @test2() norecurse { +; CHECK-NOT: %G2 + %a = load i32, i32* @G2 + store i32 4, i32* @G2 +; CHECK: ret + ret i32 %a +} + +; This global is indexed by a GEP - this makes it partial alias and we bail out. +; FIXME: We don't actually have to bail out in this case. + +; CHECK-LABEL: @test3 +define internal i32 @test3() norecurse { +; CHECK-NOT: %G3 + %x = getelementptr i32,i32* @G3, i32 0 + %a = load i32, i32* %x + store i32 4, i32* @G3 +; CHECK: ret + ret i32 %a +} + +; The global is casted away to a larger type then loaded. The store only partially +; covers the load, so we must not demote. + +; CHECK-LABEL: @test4 +define internal i32 @test4() norecurse { +; CHECK-NOT: %G4 + store i32 4, i32* @G4 + %x = bitcast i32* @G4 to i64* + %a = load i64, i64* %x + %b = trunc i64 %a to i32 +; CHECK: ret + ret i32 %b +} + +; The global is casted away to a smaller type then loaded. This one is fine. + +; CHECK-LABEL: @test5 +define internal i32 @test5() norecurse { +; CHECK-NOT: @G5 + store i32 4, i32* @G5 + %x = bitcast i32* @G5 to i16* + %a = load i16, i16* %x + %b = zext i16 %a to i32 +; CHECK: ret + ret i32 %b +} + +define i32 @main() norecurse { + %a = call i32 @test1() + %b = call i32 @test2() + %c = call i32 @test3() + %d = call i32 @test4() + %e = call i32 @test5() + + %x = or i32 %a, %b + %y = or i32 %x, %c + %z = or i32 %y, %d + %w = or i32 %z, %e + ret i32 %w +} Index: test/Transforms/GlobalOpt/metadata.ll =================================================================== --- test/Transforms/GlobalOpt/metadata.ll +++ test/Transforms/GlobalOpt/metadata.ll @@ -5,7 +5,7 @@ ; to that containing %G should likewise drop to null. @G = internal global i8** null -define i32 @main(i32 %argc, i8** %argv) { +define i32 @main(i32 %argc, i8** %argv) norecurse { ; CHECK-LABEL: @main( ; CHECK: %G = alloca store i8** %argv, i8*** @G Index: test/Transforms/MergeFunc/crash2.ll =================================================================== --- test/Transforms/MergeFunc/crash2.ll +++ test/Transforms/MergeFunc/crash2.ll @@ -11,7 +11,7 @@ @G = internal global i8** null @G2 = internal global i8** null -define i32 @main(i32 %argc, i8** %argv) { +define i32 @main(i32 %argc, i8** %argv) norecurse { ; CHECK: alloca store i8** %argv, i8*** @G ret i32 0