Index: lib/Transforms/IPO/GlobalOpt.cpp
===================================================================
--- lib/Transforms/IPO/GlobalOpt.cpp
+++ lib/Transforms/IPO/GlobalOpt.cpp
@@ -2166,8 +2166,9 @@
 /// Once an evaluation call fails, the evaluation object should not be reused.
 class Evaluator {
 public:
-  Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI)
-      : DL(DL), TLI(TLI) {
+  Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI,
+            InvariantInfo &InvInfo)
+      : DL(DL), TLI(TLI), InvInfo(InvInfo) {
     ValueStack.emplace_back();
   }
 
@@ -2243,6 +2244,7 @@
 
   const DataLayout &DL;
   const TargetLibraryInfo *TLI;
+  InvariantInfo &InvInfo;
 };
 
 } // anonymous namespace
 
@@ -2494,8 +2496,12 @@
             Size->getValue().getLimitedValue() >=
                 DL.getTypeStoreSize(ElemTy)) {
           Invariants.insert(GV);
-          DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
-                       << "\n");
+          DEBUG(dbgs()
+                << "Found a global var that is an invariant (constant): "
+                << *GV << "\n");
+        } else if (InvInfo.GetStartInstruction(GV)) {
+          DEBUG(dbgs() << "Found a global var that is writeonce: " << *GV
+                       << "\n");
         } else {
           DEBUG(dbgs() << "Found a global var, but can not treat it as an "
                           "invariant.\n");
@@ -2683,12 +2689,66 @@
   }
 }
 
+static void processInvariantIntrinsics(InvariantInfo &InvInfo, BasicBlock *BB);
+
+static void processInvariantIntrinsics(InvariantInfo &InvInfo, Function *F) {
+  for (Function::iterator BB = F->begin(), BE = F->end(); BB != BE; ++BB)
+    processInvariantIntrinsics(InvInfo, BB);
+}
+
+void processInvariantIntrinsics(InvariantInfo &InvInfo, BasicBlock *BB) {
+  // Scan the block to process invariant intrinsics, tracing whatever call
+  // chains can be traced.
+  BasicBlock::iterator CurInst = BB->begin();
+  while (CurInst != BB->end()) {
+    if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
+      CallSite CS(CurInst);
+
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CurInst))
+        processInvariantIntrinsic(II, InvInfo);
+
+      // Ignore debug info, inline asm, intrinsics, ...
+      if (isa<DbgInfoIntrinsic>(CS.getInstruction()) ||
+          isa<InlineAsm>(CS.getCalledValue()) ||
+          isa<IntrinsicInst>(CS.getInstruction())) {
+        ++CurInst;
+        continue;
+      }
+
+      Function *Callee = CS.getCalledFunction();
+      if (Callee && !Callee->isDeclaration())
+        processInvariantIntrinsics(InvInfo, Callee);
+
+      if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
+        for (unsigned i = 0; i < II->getNumSuccessors(); ++i)
+          processInvariantIntrinsics(InvInfo, II->getSuccessor(i));
+        break;
+      }
+    } else if (TerminatorInst *TI = dyn_cast<TerminatorInst>(CurInst)) {
+      for (unsigned i = 0; i < TI->getNumSuccessors(); ++i)
+        processInvariantIntrinsics(InvInfo, TI->getSuccessor(i));
+      break;
+    }
+
+    ++CurInst;
+  }
+}
+
 /// EvaluateStaticConstructor - Evaluate static constructors in the function, if
 /// we can.  Return true if we can, false otherwise.
 static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
-                                      const TargetLibraryInfo *TLI) {
+                                      const TargetLibraryInfo *TLI,
+                                      InvariantInfo &InvInfo) {
+  // Scan the Function's blocks to process invariant (start) intrinsics.
+  // This will mark writeonce global variables as written, and is necessary to
+  // do here because EvaluateBlock(), via EvaluateFunction() (below), could
+  // exit before processing the invariant intrinsic call, e.g., if a call to
+  // a function declaration that we cannot constant fold occurs before the
+  // intrinsic call.
+  processInvariantIntrinsics(InvInfo, F);
+
   // Call the function.
-  Evaluator Eval(DL, TLI);
+  Evaluator Eval(DL, TLI, InvInfo);
   Constant *RetValDummy;
   bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
                                            SmallVector<Constant*, 0>());
@@ -3039,6 +3099,7 @@
 
   auto &DL = M.getDataLayout();
   TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  auto &InvInfo = M.getInvariantInfo();
 
   bool LocalChange = true;
   while (LocalChange) {
@@ -3063,7 +3124,7 @@
 
     // Optimize global_ctors list.
     LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
-      return EvaluateStaticConstructor(F, DL, TLI);
+      return EvaluateStaticConstructor(F, DL, TLI, InvInfo);
     });
 
     // Optimize non-address-taken globals.
Index: test/Transforms/LoadElim/global-local-vars.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoadElim/global-local-vars.ll
@@ -0,0 +1,341 @@
+
+;; NOTE: The CHECKLOAD-* prefixes indicate occurrences of redundant loads in the output.
+;;       The CHECK-* prefixes indicate removal of redundant loads in the output. (ALL == 4-5A1-5A2-5B)
+
+;; * When the available load scan limit is 6, -instcombine does not
+;;   eliminate some redundant loads that it would eliminate with a load
+;;   scan limit of 8, or that -gvn would eliminate.
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-5B --check-prefix=CHECKLOAD-4-5A
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL
+
+;; * Adding '-inline -early-cse' enables a few more load eliminations, but
+;;   does not merge the same loads into the store as it does for local vars.
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-ALL
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-ALL
+
+;; * When the load scan limit is 8,
+;;   '-functionattrs -tailcallelim -instcombine' may be as good as '-gvn'.
+;;   But the same can't be said when the limit is 6.
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-4-5A-5B1 --check-prefix=CHECKLOAD-5B2
+
+%struct.A = type { i32 }
+
+@_ZL1i = internal global %struct.A zeroinitializer
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_global_local, i8* null }]
+
+;; Example 2: Unnecessary stores and loads.
+;; void ex2() {
+;;   const Type i(one());
+;;   const Type j = i;  // Note: i == j, &i != &j ==> No store.
+;;   bar(i);            // First load.
+;;   foo(&i);           // Does not change i, nor j.
+;;   bar(j);            // No load; Reuse i location.
+;; }
+define void @_Z3ex2v() {
+; CHECK: @_Z3ex2v(
+entry:
+  %j = alloca %struct.A
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %0 = bitcast %struct.A* %j to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0)
+  %1 = bitcast %struct.A* %j to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %2 = bitcast %struct.A* %j to i8*
+  %3 = call {}* @llvm.invariant.start(i64 4, i8* %2)
+  ; CHECK-NOT: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  %4 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %5 = load i32, i32* %coerce.dive
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %5)
+  call void @_Z3fooPK1A(%struct.A* @_ZL1i)
+  %6 = bitcast %struct.A* %agg.tmp1 to i8*
+  %7 = bitcast %struct.A* %j to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %8 = load i32, i32* %coerce.dive2
+  ; CHECK-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %8)
+  call void @llvm.invariant.end({}* %3, i64 4, i8* %2)
+  ; CHECK-NOT: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  %9 = bitcast %struct.A* %j to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %9)
+  ret void
+}
+
+;; Example 3: Necessary stores and loads.
+;; void ex3() {
+;;   const Type i(1);
+;;   Type k = i;  // Note: i == k, &i != &k ==> Keep store.
+;;   bar(i);      // First load.
+;;   foo(&k);     // Does not change i; May change k.
+;;   bar(k);      // Keep load.
+;; }
+define void @_Z3ex3v() {
+; CHECK: @_Z3ex3v(
+entry:
+  %k = alloca %struct.A
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %0 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0)
+  %1 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %2 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %3 = load i32, i32* %coerce.dive
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %3)
+  call void @_Z3fooPK1A(%struct.A* %k)
+  %4 = bitcast %struct.A* %agg.tmp1 to i8*
+  %5 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %6 = load i32, i32* %coerce.dive2
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %6)
+  %7 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %7)
+  ret void
+}
+
+;; Example 4: Smart stores and loads.
+;; void ex4() {
+;;   const Type i(one());
+;;   Type k = i;  // Note: i == k, &i != &k ==> May keep store.
+;;   bar(i);      // First load.
+;;   foo(&i);     // Does not change i, nor k.
+;;   bar(k);      // No load; Reuse i location.
+;;   foo(&k);     // Does not change i; May change k.
+;;   bar(k);      // Keep load.
+;; }
+define void @_Z3ex4v() {
+; CHECK: @_Z3ex4v(
+entry:
+  %k = alloca %struct.A
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %agg.tmp3 = alloca %struct.A
+  %0 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0)
+  %1 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %2 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %3 = load i32, i32* %coerce.dive
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %3)
+  call void @_Z3fooPK1A(%struct.A* @_ZL1i)
+  %4 = bitcast %struct.A* %agg.tmp1 to i8*
+  %5 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %6 = load i32, i32* %coerce.dive2
+  ; CHECKLOAD-4-5A1: load i32, i32*
+  ; CHECKLOAD-4-5A: load i32, i32*
+  ; CHECK-4-5A1-5B-NOT: load i32, i32*
+  ; CHECK-4-5A-5B1-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %6)
+  call void @_Z3fooPK1A(%struct.A* %k)
+  %7 = bitcast %struct.A* %agg.tmp3 to i8*
+  %8 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %8, i64 4, i32 4, i1 false)
+  %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0
+  %9 = load i32, i32* %coerce.dive4
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %9)
+  %10 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %10)
+  ret void
+}
+
+;; Example 5: Duplicate and smart loads (and stores).
+;; void ex5a() {
+;;   const Type i(one());
+;;   Type k = i;    // Note: i == k, &i != &k ==> May keep store.
+;;   bar(i);        // First load.
+;;   bar(k);        // No load; Reuse i location.
+;;   foo2(&k, &i);  // Does not change i; May change k.
+;;   bar(i);        // No load.
+;;   bar(k);        // Keep load.
+;; }
+define void @_Z4ex5av() {
+; CHECK: @_Z4ex5av(
+entry:
+  %k = alloca %struct.A
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %agg.tmp3 = alloca %struct.A
+  %agg.tmp5 = alloca %struct.A
+  %0 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0)
+  %1 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %2 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %3 = load i32, i32* %coerce.dive
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %3)
+  %4 = bitcast %struct.A* %agg.tmp1 to i8*
+  %5 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %6 = load i32, i32* %coerce.dive2
+  ; CHECKLOAD-4-5A1: load i32, i32*
+  ; CHECKLOAD-4-5A: load i32, i32*
+  ; CHECK-4-5A-5B1-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %6)
+  call void @_Z4foo2PK1AS1_(%struct.A* %k, %struct.A* @_ZL1i)
+  %7 = bitcast %struct.A* %agg.tmp3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0
+  %8 = load i32, i32* %coerce.dive4
+  ; CHECKLOAD-4-5A: load i32, i32*
+  ; CHECK-4-5A-5B1-NOT: load i32, i32*
+  ; CHECK-5A2-5B-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %8)
+  %9 = bitcast %struct.A* %agg.tmp5 to i8*
+  %10 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* %10, i64 4, i32 4, i1 false)
+  %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0
+  %11 = load i32, i32* %coerce.dive6
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %11)
+  %12 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %12)
+  ret void
+}
+
+;; Example 5: Duplicate and smart loads (and stores).
+;; void ex5b() {
+;;   const Type i(one());
+;;   const Type j = i;  // Note: i == j, &i != &j ==> No store.
+;;   bar(i);            // First load.
+;;   bar(j);            // No load; Reuse i location.
+;;   foo2(&j, &i);      // Does not change i, nor j.
+;;   bar(i);            // No load.
+;;   bar(j);            // No load; Reuse i location.
+;; }
+define void @_Z4ex5bv() {
+; CHECK: @_Z4ex5bv(
+entry:
+  %j = alloca %struct.A
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %agg.tmp3 = alloca %struct.A
+  %agg.tmp5 = alloca %struct.A
+  %0 = bitcast %struct.A* %j to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0)
+  %1 = bitcast %struct.A* %j to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %2 = bitcast %struct.A* %j to i8*
+  ; CHECK-4-5A1-5B: load i32, i32*
+  ; CHECK-5A2-5B: load i32, i32*
+  ; CHECK-ALL: load i32, i32*
+  %3 = call {}* @llvm.invariant.start(i64 4, i8* %2)
+  ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  %4 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %5 = load i32, i32* %coerce.dive
+  ; CHECK-5B: load i32, i32*
+  call void @_Z3bar1A(i32 %5)
+  %6 = bitcast %struct.A* %agg.tmp1 to i8*
+  %7 = bitcast %struct.A* %j to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %8 = load i32, i32* %coerce.dive2
+  ; CHECK-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %8)
+  call void @_Z4foo2PK1AS1_(%struct.A* %j, %struct.A* @_ZL1i)
+  %9 = bitcast %struct.A* %agg.tmp3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0
+  %10 = load i32, i32* %coerce.dive4
+  ; CHECK-4-5A1-5B-NOT: load i32, i32*
+  ; CHECK-4-5A-5B1-NOT: load i32, i32*
+  ; CHECK-5A2-5B-NOT: load i32, i32*
+  ; CHECK-5B-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %10)
+  %11 = bitcast %struct.A* %agg.tmp5 to i8*
+  %12 = bitcast %struct.A* %j to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %11, i8* %12, i64 4, i32 4, i1 false)
+  %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0
+  %13 = load i32, i32* %coerce.dive6
+  ; CHECKLOAD-5B2: load i32, i32*
+  ; CHECK-4-5A1-5B-NOT: load i32, i32*
+  ; CHECK-5A2-5B-NOT: load i32, i32*
+  ; CHECK-5B-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %13)
+  call void @llvm.invariant.end({}* %3, i64 4, i8* %2)
+  ; CHECK: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  %14 = bitcast %struct.A* %j to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %14)
+  ret void
+}
+
+define internal void @__cxx_global_var_init() {
+entry:
+  %call = call i32 @_Z3onev()
+  call void @_ZN1AC1Ei(%struct.A* @_ZL1i, i32 %call)
+  ; CHECKINL: store i32 {{.*}}, i32*
+  %0 = call {}* @llvm.invariant.start(i64 4, i8* bitcast (%struct.A* @_ZL1i to i8*))
+  ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  ret void
+}
+
+declare i32 @_Z3onev()
+declare void @_Z3bar1A(i32)
+declare void @_Z3fooPK1A(%struct.A*)
+declare void @_Z4foo2PK1AS1_(%struct.A*, %struct.A*)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+declare {}* @llvm.invariant.start(i64, i8* nocapture)
+declare void @llvm.invariant.end({}*, i64, i8* nocapture)
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+define internal void @_GLOBAL__sub_I_global_local() {
+entry:
+  call void @__cxx_global_var_init()
+  ret void
+}
+
+define linkonce_odr void @_ZN1AC1Ei(%struct.A* %this, i32 %a) {
+entry:
+  %this.addr = alloca %struct.A*
+  %a.addr = alloca i32
+  store %struct.A* %this, %struct.A** %this.addr
+  store i32 %a, i32* %a.addr
+  %this1 = load %struct.A*, %struct.A** %this.addr
+  %0 = load i32, i32* %a.addr
+  call void @_ZN1AC2Ei(%struct.A* %this1, i32 %0)
+  ret void
+}
+
+define linkonce_odr void @_ZN1AC2Ei(%struct.A* %this, i32 %a) {
+entry:
+  %this.addr = alloca %struct.A*
+  %a.addr = alloca i32
+  store %struct.A* %this, %struct.A** %this.addr
+  store i32 %a, i32* %a.addr
+  %this1 = load %struct.A*, %struct.A** %this.addr
+  %a2 = getelementptr inbounds %struct.A, %struct.A* %this1, i32 0, i32 0
+  %0 = load i32, i32* %a.addr
+  store i32 %0, i32* %a2
+  ret void
+}
Index: test/Transforms/LoadElim/global-vars.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoadElim/global-vars.ll
@@ -0,0 +1,327 @@
+
+;; NOTE: The CHECKLOAD-* prefixes indicate occurrences of redundant loads in the output,
+;;       expected when the loads are not from global variables.
+;;       The CHECK-* prefixes indicate removal of redundant loads in the output,
+;;       expected when the loads are not from global variables.
+;;       (ALL == 4-5A1-5A2-5B)
+
+;; * When the available load scan limit is 6, -instcombine does not
+;;   eliminate some redundant loads that it would eliminate with a load
+;;   scan limit of 8, or that -gvn would eliminate.
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL
+
+;; * Adding '-inline -early-cse' enables a few more load eliminations, but
+;;   does not merge the same loads into the store as it does for local vars.
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-ALL
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-ALL
+
+;; * When the load scan limit is 8,
+;;   '-functionattrs -tailcallelim -instcombine' may be as good as '-gvn'.
+;;   But the same can't be said when the limit is 6.
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-4-5A-5B1 --check-prefix=CHECKLOAD-5B2
+
+%struct.A = type { i32 }
+
+@_ZL1i = internal global %struct.A zeroinitializer
+@k = global %struct.A zeroinitializer
+@_ZL1j = internal global %struct.A zeroinitializer
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_global, i8* null }]
+
+;; Example 1: Duplicate loads.
+;; void ex1() {
+;;   const Type i(one());
+;;   bar(i);   // First load.
+;;   foo(&i);  // Does not change i.
+;;   bar(i);   // No load.
+;; }
+define void @_Z3ex1v() {
+; CHECK: @_Z3ex1v(
+entry:
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %0 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %1 = load i32, i32* %coerce.dive
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %1)
+  call void @_Z3fooPK1A(%struct.A* @_ZL1i)
+  %2 = bitcast %struct.A* %agg.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %3 = load i32, i32* %coerce.dive2
+  ; CHECK-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %3)
+  ret void
+}
+
+;; Example 2: Unnecessary stores and loads.
+;; void ex2() {
+;;   const Type i(one());
+;;   const Type j = i;  // Note: i == j, &i != &j ==> No store.
+;;   bar(i);            // First load.
+;;   foo(&i);           // Does not change i, nor j.
+;;   bar(j);            // No load; Reuse i location.
+;; }
+define void @_Z3ex2v() {
+; CHECK: @_Z3ex2v(
+entry:
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %0 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %1 = load i32, i32* %coerce.dive
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %1)
+  call void @_Z3fooPK1A(%struct.A* @_ZL1i)
+  %2 = bitcast %struct.A* %agg.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1j to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %3 = load i32, i32* %coerce.dive2
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %3)
+  ret void
+}
+
+;; Example 3: Necessary stores and loads.
+;; void ex3() {
+;;   const Type i(1);
+;;   Type k = i;  // Note: i == k, &i != &k ==> Keep store.
+;;   bar(i);      // First load.
+;;   foo(&k);     // Does not change i; May change k.
+;;   bar(k);      // Keep load.
+;; }
+define void @_Z3ex3v() {
+; CHECK: @_Z3ex3v(
+entry:
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %0 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %1 = load i32, i32* %coerce.dive
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %1)
+  call void @_Z3fooPK1A(%struct.A* @k)
+  %2 = bitcast %struct.A* %agg.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %3 = load i32, i32* %coerce.dive2
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %3)
+  ret void
+}
+
+;; Example 4: Smart stores and loads.
+;; void ex4() {
+;;   const Type i(one());
+;;   Type k = i;  // Note: i == k, &i != &k ==> May keep store.
+;;   bar(i);      // First load.
+;;   foo(&i);     // Does not change i, nor k.
+;;   bar(k);      // No load; Reuse i location.
+;;   foo(&k);     // Does not change i; May change k.
+;;   bar(k);      // Keep load.
+;; }
+define void @_Z3ex4v() {
+; CHECK: @_Z3ex4v(
+entry:
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %agg.tmp3 = alloca %struct.A
+  %0 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %1 = load i32, i32* %coerce.dive
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %1)
+  call void @_Z3fooPK1A(%struct.A* @_ZL1i)
+  %2 = bitcast %struct.A* %agg.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %3 = load i32, i32* %coerce.dive2
+  ; CHECKLOAD-4-5A1: load i32, i32*
+  ; CHECK-4-5A-5B1: load i32, i32*
+  ; CHECK-ALL: load i32, i32*
+  call void @_Z3bar1A(i32 %3)
+  call void @_Z3fooPK1A(%struct.A* @k)
+  %4 = bitcast %struct.A* %agg.tmp3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0
+  %5 = load i32, i32* %coerce.dive4
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %5)
+  ret void
+}
+
+;; Example 5: Duplicate and smart loads (and stores).
+;; void ex5a() {
+;;   const Type i(one());
+;;   Type k = i;    // Note: i == k, &i != &k ==> May keep store.
+;;   bar(i);        // First load.
+;;   bar(k);        // No load; Reuse i location.
+;;   foo2(&k, &i);  // Does not change i; May change k.
+;;   bar(i);        // No load.
+;;   bar(k);        // Keep load.
+;; }
+define void @_Z4ex5av() {
+; CHECK: @_Z4ex5av(
+entry:
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %agg.tmp3 = alloca %struct.A
+  %agg.tmp5 = alloca %struct.A
+  %0 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %1 = load i32, i32* %coerce.dive
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %1)
+  %2 = bitcast %struct.A* %agg.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %3 = load i32, i32* %coerce.dive2
+  ; CHECKLOAD-4-5A1: load i32, i32*
+  ; CHECK-4-5A-5B1: load i32, i32*
+  ; CHECK-ALL: load i32, i32*
+  call void @_Z3bar1A(i32 %3)
+  call void @_Z4foo2PK1AS1_(%struct.A* @k, %struct.A* @_ZL1i)
+  %4 = bitcast %struct.A* %agg.tmp3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0
+  %5 = load i32, i32* %coerce.dive4
+  ; CHECK-4-5A-5B1-NOT: load i32, i32*
+  ; CHECK-5A2-5B-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %5)
+  %6 = bitcast %struct.A* %agg.tmp5 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0
+  %7 = load i32, i32* %coerce.dive6
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %7)
+  ret void
+}
+
+;; Example 5: Duplicate and smart loads (and stores).
+;; void ex5b() {
+;;   const Type i(one());
+;;   const Type j = i;  // Note: i == j, &i != &j ==> No store.
+;;   bar(i);            // First load.
+;;   bar(j);            // No load; Reuse i location.
+;;   foo2(&j, &i);      // Does not change i, nor j.
+;;   bar(i);            // No load.
+;;   bar(j);            // No load; Reuse i location.
+;; }
+define void @_Z4ex5bv() {
+; CHECK: @_Z4ex5bv(
+entry:
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %agg.tmp3 = alloca %struct.A
+  %agg.tmp5 = alloca %struct.A
+  %0 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %1 = load i32, i32* %coerce.dive
+  ; CHECK-5A2-5B: load i32, i32*
+  ; CHECK-ALL: load i32, i32*
+  call void @_Z3bar1A(i32 %1)
+  %2 = bitcast %struct.A* %agg.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1j to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %3 = load i32, i32* %coerce.dive2
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %3)
+  call void @_Z4foo2PK1AS1_(%struct.A* @_ZL1j, %struct.A* @_ZL1i)
+  %4 = bitcast %struct.A* %agg.tmp3 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0
+  %5 = load i32, i32* %coerce.dive4
+  ; CHECK-4-5A-5B1-NOT: load i32, i32*
+  ; CHECK-5A2-5B-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %5)
+  %6 = bitcast %struct.A* %agg.tmp5 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.A* @_ZL1j to i8*), i64 4, i32 4, i1 false)
+  %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0
+  %7 = load i32, i32* %coerce.dive6
+  ; CHECKLOAD-5B2: load i32, i32*
+  ; CHECK-5A2-5B-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %7)
+  ret void
+}
+
+define internal void @__cxx_global_var_init() {
+entry:
+  %call = call i32 @_Z3onev()
+  call void @_ZN1AC1Ei(%struct.A* @_ZL1i, i32 %call)
+  ; CHECKINL: store i32 {{.*}}, i32*
+  %0 = call {}* @llvm.invariant.start(i64 4, i8* bitcast (%struct.A* @_ZL1i to i8*))
+  ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  ret void
+}
+
+define internal void @__cxx_global_var_init.2() {
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.A* @_ZL1j to i8*), i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  %0 = call {}* @llvm.invariant.start(i64 4, i8* bitcast (%struct.A* @_ZL1j to i8*))
+  ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  ret void
+}
+
+declare i32 @_Z3onev()
+declare void @_Z3bar1A(i32)
+declare void @_Z3fooPK1A(%struct.A*)
+declare void @_Z4foo2PK1AS1_(%struct.A*, %struct.A*)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+declare {}* @llvm.invariant.start(i64, i8* nocapture)
+declare void @llvm.invariant.end({}*, i64, i8* nocapture)
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+define internal void @_GLOBAL__sub_I_global() {
+entry:
+  call void @__cxx_global_var_init()
+  call void @__cxx_global_var_init.2()
+  call void @__cxx_global_var_init.1()
+  ret void
+}
+
+define internal void @__cxx_global_var_init.1() {
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.A* @k to i8*), i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false)
+  ret void
+}
+
+define linkonce_odr void @_ZN1AC1Ei(%struct.A* %this, i32 %a) {
+entry:
+  %this.addr = alloca %struct.A*
+  %a.addr = alloca i32
+  store %struct.A* %this, %struct.A** %this.addr
+  store i32 %a, i32* %a.addr
+  %this1 = load %struct.A*, %struct.A** %this.addr
+  %0 = load i32, i32* %a.addr
+  call void @_ZN1AC2Ei(%struct.A* %this1, i32 %0)
+  ret void
+}
+
+define linkonce_odr void @_ZN1AC2Ei(%struct.A* %this, i32 %a) {
+entry:
+  %this.addr = alloca %struct.A*
+  %a.addr = alloca i32
+  store %struct.A* %this, %struct.A** %this.addr
+  store i32 %a, i32* %a.addr
+  %this1 = load %struct.A*, %struct.A** %this.addr
+  %a2 = getelementptr inbounds %struct.A, %struct.A* %this1, i32 0, i32 0
+  %0 = load i32, i32* %a.addr
+  store i32 %0, i32* %a2
+  ret void
+}
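
Reviewer note (not part of the patch): a minimal sketch of the "writeonce" pattern these tests exercise, boiled down from the test files above. The names @g and @init are hypothetical, and the InvariantInfo/GetStartInstruction bookkeeping is assumed from the rest of this patch series; the point is only that a global written exactly once in a constructor and then covered by llvm.invariant.start can be treated as invariant even though it is not foldable to a constant initializer.

  %struct.A = type { i32 }
  @g = internal global %struct.A zeroinitializer

  define internal void @init() {
  entry:
    ; The single store that makes @g "writeonce" rather than constant:
    ; its value comes from a call we cannot constant fold.
    store i32 42, i32* getelementptr inbounds (%struct.A, %struct.A* @g, i32 0, i32 0)
    ; @g is never written again; per this patch, InvariantInfo would record
    ; this llvm.invariant.start as the global's start instruction, so
    ; -globalopt can mark @g invariant for later load elimination.
    %inv = call {}* @llvm.invariant.start(i64 4, i8* bitcast (%struct.A* @g to i8*))
    ret void
  }

  declare {}* @llvm.invariant.start(i64, i8* nocapture)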