Index: include/llvm/IR/GlobalVariable.h =================================================================== --- include/llvm/IR/GlobalVariable.h +++ include/llvm/IR/GlobalVariable.h @@ -31,6 +31,7 @@ class Constant; template class SymbolTableListTraits; +class IntrinsicInst; class GlobalVariable : public GlobalObject, public ilist_node { friend class SymbolTableListTraits; @@ -45,6 +46,10 @@ // can change from its initial // value before global // initializers are run? + IntrinsicInst *InvariantStartInst; // Transient + + void checkInvariantStartInstruction(IntrinsicInst *II); + public: // allocate space for exactly one operand void *operator new(size_t s) { @@ -144,6 +149,14 @@ bool isConstant() const { return isConstantGlobal; } void setConstant(bool Val) { isConstantGlobal = Val; } + IntrinsicInst *getInvariantStartInstruction() const { + return InvariantStartInst; + } + void setInvariantStartInstruction(IntrinsicInst *II) { + checkInvariantStartInstruction(II); + InvariantStartInst = II; + } + bool isExternallyInitialized() const { return isExternallyInitializedConstant; } Index: include/llvm/IR/Instructions.h =================================================================== --- include/llvm/IR/Instructions.h +++ include/llvm/IR/Instructions.h @@ -34,6 +34,7 @@ class ConstantRange; class DataLayout; class LLVMContext; +class IntrinsicInst; enum AtomicOrdering { NotAtomic = 0, @@ -75,6 +76,9 @@ /// class AllocaInst : public UnaryInstruction { Type *AllocatedType; + IntrinsicInst *InvariantStartInst; // Transient. + + void checkInvariantStartInstruction(IntrinsicInst *II); protected: // Note: Instruction needs to be a friend here to call cloneImpl. @@ -132,6 +136,14 @@ } void setAlignment(unsigned Align); + IntrinsicInst *getInvariantStartInstruction() const { + return InvariantStartInst; + } + void setInvariantStartInstruction(IntrinsicInst *II) { + checkInvariantStartInstruction(II); + InvariantStartInst = II; + } + /// isStaticAlloca - Return true if this alloca is in the entry block of the /// function and is a constant size. If so, the code generator will fold it /// into the prolog/epilog code, so it is basically free. 
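Note for reviewers (not part of the diff): the two headers above only add a transient InvariantStartInst pointer plus checked accessors; nothing keeps that cache up to date by itself. Below is a minimal sketch of how a pass that visits the intrinsics is expected to drive the new accessors. It mirrors llvm::processInvariantIntrinsics() added to lib/IR/Instructions.cpp later in this patch; the free function recordInvariantRegions and its standalone form are illustrative only.

// Sketch only -- not part of the patch.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static void recordInvariantRegions(BasicBlock &BB) {
  for (Instruction &I : BB) {
    auto *II = dyn_cast<IntrinsicInst>(&I);
    if (!II)
      continue;
    if (II->getIntrinsicID() == Intrinsic::invariant_start) {
      // llvm.invariant.start(i64 size, i8* ptr): the protected object is
      // operand 1. Cache the intrinsic on the underlying alloca or global.
      Value *Addr = II->getArgOperand(1)->stripPointerCasts();
      if (auto *AI = dyn_cast<AllocaInst>(Addr))
        AI->setInvariantStartInstruction(II);
      else if (auto *GV = dyn_cast<GlobalVariable>(Addr))
        GV->setInvariantStartInstruction(II);
    } else if (II->getIntrinsicID() == Intrinsic::invariant_end) {
      // llvm.invariant.end({}* start, i64 size, i8* ptr): the object is
      // operand 2. The writeonce region is over, so drop the cached marker.
      Value *Addr = II->getArgOperand(2)->stripPointerCasts();
      if (auto *AI = dyn_cast<AllocaInst>(Addr))
        AI->setInvariantStartInstruction(nullptr);
      else if (auto *GV = dyn_cast<GlobalVariable>(Addr))
        GV->setInvariantStartInstruction(nullptr);
    }
  }
}

The RAII helper PreserveInvariantInfo declared in the next hunk exists because the backward scans in Loads.cpp and MemoryDependenceAnalysis.cpp temporarily clear this cache and must restore it on every exit path.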
@@ -4855,6 +4867,41 @@ } }; +//===----------------------------------------------------------------------===// +// Processing invariant_start/end intrinsics +//===----------------------------------------------------------------------===// + +IntrinsicInst *getInvariantStartInstruction(const Value* Arg); +void setInvariantStartInstruction(Value* Arg, IntrinsicInst *Val); +bool processInvariantIntrinsics(IntrinsicInst* II); + +struct PreservedInvariantInfo { + IntrinsicInst *II; + Value *Load; + PreservedInvariantInfo(): II(nullptr), Load(nullptr) { } +}; + +class PreserveInvariantInfo { + IntrinsicInst *PreservedII; + Value *LoadI; + void CheckPreservedInfo(); +public: + PreserveInvariantInfo(IntrinsicInst *II, Value *LI) + : PreservedII(II), LoadI(LI) { + if (PreservedII) + CheckPreservedInfo(); + } + ~PreserveInvariantInfo() { + if (PreservedII) + setInvariantStartInstruction(LoadI, PreservedII); + } +}; + +void setPreservedInvariantInfo(PreservedInvariantInfo &Preserved, + BasicBlock::iterator &ScanBackwardFrom, + Value *Query, BasicBlock *BB, + bool SkipToInvariantStart = false); + } // End llvm namespace #endif Index: lib/Analysis/BasicAliasAnalysis.cpp =================================================================== --- lib/Analysis/BasicAliasAnalysis.cpp +++ lib/Analysis/BasicAliasAnalysis.cpp @@ -488,12 +488,16 @@ if (OrLocal && isa(V)) continue; + if (const AllocaInst *AI = dyn_cast(V)) + if (AI->getInvariantStartInstruction()) + continue; + // A global constant counts as local memory for our purposes. if (const GlobalVariable *GV = dyn_cast(V)) { // Note: this doesn't require GV to be "ODR" because it isn't legal for a // global to be marked constant in some modules and non-constant in // others. GV may even be a declaration, not a definition. - if (!GV->isConstant()) { + if (!GV->isConstant() && !GV->getInvariantStartInstruction()) { Visited.clear(); return AAResultBase::pointsToConstantMemory(Loc, OrLocal); } @@ -670,6 +674,12 @@ return Alias; } +static bool isInvariantIntrinsic(ImmutableCallSite CS) { + const IntrinsicInst *II = dyn_cast(CS.getInstruction()); + return II && (II->getIntrinsicID() == Intrinsic::invariant_start || + II->getIntrinsicID() == Intrinsic::invariant_end); +} + /// Checks to see if the specified callsite can clobber the specified memory /// object. /// @@ -731,6 +741,10 @@ if (isAssumeIntrinsic(CS)) return MRI_NoModRef; + // *.invariant.* intrinsics follow the same pattern as the assume intrinsic. + if (isInvariantIntrinsic(CS)) + return MRI_NoModRef; + // The AAResultBase base class has some smarts, lets use them. return AAResultBase::getModRefInfo(CS, Loc); } @@ -743,6 +757,10 @@ if (isAssumeIntrinsic(CS1) || isAssumeIntrinsic(CS2)) return MRI_NoModRef; + // *.invariant.* intrinsics follow the same pattern as the assume intrinsic. + if (isInvariantIntrinsic(CS1) || isInvariantIntrinsic(CS2)) + return MRI_NoModRef; + // The AAResultBase base class has some smarts, lets use them. return AAResultBase::getModRefInfo(CS1, CS2); } Index: lib/Analysis/Loads.cpp =================================================================== --- lib/Analysis/Loads.cpp +++ lib/Analysis/Loads.cpp @@ -167,8 +167,10 @@ /// threading in part by eliminating partially redundant loads. /// At that point, the value of MaxInstsToScan was already set to '6' /// without documented explanation. +/// We have bumped this number up to '8' to improve the chances of a +/// behavioral match when -instcombine is run after, or without, -inline. 
cl::opt -llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden, +llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(8), cl::Hidden, cl::desc("Use this to specify the default maximum number of instructions " "to scan backward from a given instruction, when searching for " "available loaded value")); @@ -208,6 +210,15 @@ Value *StrippedPtr = Ptr->stripPointerCasts(); + + // We're about to scan backwards. Preserve the initial invariant_start + // intrinsic marking on this load, for subsequent instructions. + // First, compute the info to preserve (and do not skip any instruction). + // Then, actually preserve the info before backward scanning starts. + PreservedInvariantInfo Preserved; + setPreservedInvariantInfo(Preserved, ScanFrom, StrippedPtr, ScanBB); + PreserveInvariantInfo PIO(Preserved.II, Preserved.Load); + while (ScanFrom != ScanBB->begin()) { // We must ignore debug info directives when counting (otherwise they // would affect codegen). @@ -215,6 +226,18 @@ if (isa(Inst)) continue; + // Same for invariant intrinsics. + if (IntrinsicInst *II = dyn_cast(Inst)) { + if (II->getIntrinsicID() == Intrinsic::invariant_start) { + if (II == Preserved.II) + // We did not skip any instruction earlier. So, we must express that + // the given load is no longer pointing to constant memory. + llvm::setInvariantStartInstruction(Preserved.Load, nullptr); + continue; + } else if (II->getIntrinsicID() == Intrinsic::invariant_end) + continue; + } + // Restore ScanFrom to expected value in case next test succeeds ScanFrom++; Index: lib/Analysis/MemoryDependenceAnalysis.cpp =================================================================== --- lib/Analysis/MemoryDependenceAnalysis.cpp +++ lib/Analysis/MemoryDependenceAnalysis.cpp @@ -423,6 +423,17 @@ isInvariantLoad = true; } + // We're about to scan backwards. Preserve the initial invariant_start + // intrinsic marking on this load, for subsequent instructions. + // First, compute the info to preserve and prepare to skip instructions + // that need no further processing. + // Then, actually preserve the info before backward scanning starts. 
+ PreservedInvariantInfo Preserved; + if (isLoad && QueryInst) + setPreservedInvariantInfo(Preserved, ScanIt, QueryInst, BB, + /*SkipToInvariantStart =*/ true); + PreserveInvariantInfo PIO(Preserved.II, Preserved.Load); + const DataLayout &DL = BB->getModule()->getDataLayout(); // Create a numbered basic block to lazily compute and cache instruction Index: lib/IR/Globals.cpp =================================================================== --- lib/IR/Globals.cpp +++ lib/IR/Globals.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Support/ErrorHandling.h" @@ -143,6 +144,11 @@ // GlobalVariable Implementation //===----------------------------------------------------------------------===// +void GlobalVariable::checkInvariantStartInstruction(IntrinsicInst *II) { + assert((!II || II->getIntrinsicID() == Intrinsic::invariant_start) && + "Given intrinsic instruction is not invariant_start"); +} + GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, ThreadLocalMode TLMode, unsigned AddressSpace, @@ -151,7 +157,8 @@ OperandTraits::op_begin(this), InitVal != nullptr, Link, Name, AddressSpace), isConstantGlobal(constant), - isExternallyInitializedConstant(isExternallyInitialized) { + isExternallyInitializedConstant(isExternallyInitialized), + InvariantStartInst(nullptr) { setThreadLocalMode(TLMode); if (InitVal) { assert(InitVal->getType() == Ty && @@ -169,7 +176,8 @@ OperandTraits::op_begin(this), InitVal != nullptr, Link, Name, AddressSpace), isConstantGlobal(constant), - isExternallyInitializedConstant(isExternallyInitialized) { + isExternallyInitializedConstant(isExternallyInitialized), + InvariantStartInst(nullptr) { setThreadLocalMode(TLMode); if (InitVal) { assert(InitVal->getType() == Ty && Index: lib/IR/Instructions.cpp =================================================================== --- lib/IR/Instructions.cpp +++ lib/IR/Instructions.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Support/ErrorHandling.h" @@ -1150,6 +1151,11 @@ // AllocaInst Implementation //===----------------------------------------------------------------------===// +void AllocaInst::checkInvariantStartInstruction(IntrinsicInst *II) { + assert((!II || II->getIntrinsicID() == Intrinsic::invariant_start) && + "Given intrinsic instruction is not invariant_start"); +} + static Value *getAISize(LLVMContext &Context, Value *Amt) { if (!Amt) Amt = ConstantInt::get(Type::getInt32Ty(Context), 1); @@ -1180,7 +1186,7 @@ const Twine &Name, Instruction *InsertBefore) : UnaryInstruction(PointerType::getUnqual(Ty), Alloca, getAISize(Ty->getContext(), ArraySize), InsertBefore), - AllocatedType(Ty) { + AllocatedType(Ty), InvariantStartInst(nullptr) { setAlignment(Align); assert(!Ty->isVoidTy() && "Cannot allocate void!"); setName(Name); @@ -1190,7 +1196,7 @@ const Twine &Name, BasicBlock *InsertAtEnd) : UnaryInstruction(PointerType::getUnqual(Ty), Alloca, getAISize(Ty->getContext(), ArraySize), InsertAtEnd), - AllocatedType(Ty) { + AllocatedType(Ty), InvariantStartInst(nullptr) { setAlignment(Align); assert(!Ty->isVoidTy() && "Cannot allocate void!"); setName(Name); @@ -4006,3 +4012,82 @@ LLVMContext &Context = 
getContext(); return new UnreachableInst(Context); } + +//===----------------------------------------------------------------------===// +// Processing invariant_start/end intrinsics +//===----------------------------------------------------------------------===// + +void llvm::PreserveInvariantInfo::CheckPreservedInfo() { + assert(PreservedII->getIntrinsicID() == Intrinsic::invariant_start && + "Preserved instruction must be an invariant_start intrinsic"); + assert(LoadI && "Can't preserve an intrinsic instruction without a load."); +} + +/// If the given Query is a load from writeonce readonly memory, we can speed +/// up backward scanning by jumping to the associated invariant_start +/// instruction, *if* that instruction is in block BB. +/// Then, since we would be scanning backward, undo the invariant_start +/// intrinsic marking so that getModRefInfo() knows that the load does not +/// point to constant memory from this point on. +void llvm::setPreservedInvariantInfo(PreservedInvariantInfo &Preserved, + BasicBlock::iterator &ScanBackwardFrom, + Value *Query, BasicBlock *BB, + bool SkipToInvariantStart) { + if (LoadInst *LI = dyn_cast(Query)) { + Value *I = LI->getPointerOperand()->stripPointerCasts(); + if (IntrinsicInst *II = llvm::getInvariantStartInstruction(I)) { + // NOTE: II's parent block can only either be the same as or contain + // ScanIt's parent block (i.e. BB). When not the same, set + // ScanIt to the begin of BB to skip the backward traversal. + if (II->getParent() == ScanBackwardFrom->getParent()) { + // If there is anything to scan backward at all, then we must still + // be processing BB, and this load no longer points to constant + // memory. Record the info to preserve before resetting it. + Preserved.II = II; + Preserved.Load = I; + if (SkipToInvariantStart) { + ScanBackwardFrom = II; + llvm::setInvariantStartInstruction(I, nullptr); + } + } + else if (SkipToInvariantStart) + ScanBackwardFrom = BB->begin(); + } + } +} + +IntrinsicInst *llvm::getInvariantStartInstruction(const Value* Arg) { + if (!Arg) return nullptr; + if (const AllocaInst* AI = dyn_cast(Arg)) + return AI->getInvariantStartInstruction(); + if (const GlobalVariable* GV = dyn_cast(Arg)) + if (!GV->isConstant()) + return GV->getInvariantStartInstruction(); + return nullptr; +} + +void llvm::setInvariantStartInstruction(Value* Arg, IntrinsicInst *Val) { + if (!Arg) return; + if (GlobalVariable* GV = dyn_cast(Arg)) + GV->setInvariantStartInstruction(Val); + if (AllocaInst* AI = dyn_cast(Arg)) + AI->setInvariantStartInstruction(Val); +} + +// Process @llvm.invariant.start/end intrinsics. +bool llvm::processInvariantIntrinsics(IntrinsicInst* II) { + assert(II && "Can't mark a null instruction."); + + if (II->getIntrinsicID() == Intrinsic::invariant_start) { + llvm::Value *Addr = II->getArgOperand(1)->stripPointerCasts(); + setInvariantStartInstruction(Addr, II); + return true; + } + else if (II->getIntrinsicID() == Intrinsic::invariant_end) { + llvm::Value *Addr = II->getArgOperand(2)->stripPointerCasts(); + if (getInvariantStartInstruction(Addr)) + setInvariantStartInstruction(Addr, nullptr); + return true; + } + return false; +} Index: lib/Transforms/IPO/GlobalOpt.cpp =================================================================== --- lib/Transforms/IPO/GlobalOpt.cpp +++ lib/Transforms/IPO/GlobalOpt.cpp @@ -2191,10 +2191,13 @@ /// control flows into, or null upon return. 
bool EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB); - Constant *getVal(Value *V) { + Constant *getVal(Value *V, bool CheckComputed = true) { if (Constant *CV = dyn_cast(V)) return CV; Constant *R = ValueStack.back().lookup(V); - assert(R && "Reference to an uncomputed value!"); + + // Allow references to uncomputed values from processInvariantIntrinsics(). + if (CheckComputed) + assert(R && "Reference to an uncomputed value!"); return R; } @@ -2210,6 +2213,10 @@ return Invariants; } + const SmallPtrSetImpl &getReadOnlys() const { + return ReadOnlys; + } + private: Constant *ComputeLoadResult(Constant *P); @@ -2237,6 +2244,10 @@ /// static constructor. SmallPtrSet Invariants; + /// ReadOnlys - These global variables are writeonce variables that + /// have been marked readonly by the static constructor. + SmallPtrSet ReadOnlys; + /// SimpleConstants - These are constants we have checked and know to be /// simple enough to live in a static initializer of a global. SmallPtrSet SimpleConstants; @@ -2494,8 +2505,13 @@ Size->getValue().getLimitedValue() >= DL.getTypeStoreSize(ElemTy)) { Invariants.insert(GV); - DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV + DEBUG(dbgs() << "Found a global var that is an invariant (constant): " << *GV << "\n"); + } + else if (GV->getInvariantStartInstruction()) { + ReadOnlys.insert(GV); + DEBUG(dbgs() << "Found a global var that is a readonly writeonce: " << *GV + << "\n"); } else { DEBUG(dbgs() << "Found a global var, but can not treat it as an " "invariant.\n"); @@ -2619,6 +2635,84 @@ } } +static void processInvariantIntrinsics(Evaluator &Eval, Function *F) { + + // Scan the block to process invariant intrinsics, tracing whatever + // call chain that can be traced. + // Without this, invariant intrinsics on global variables, can only be + // processed when the constructor calls are inlined. + // TODO: Instead of forcing this tracing, can we rely on -O1's -always-inline + // or -O2's -inline? + BasicBlock *BB = F->begin(); + while (BB) { + BasicBlock* NextBB = nullptr; + BasicBlock::iterator CurInst = BB->begin(); + + while (CurInst) { + + if (isa(CurInst) || isa(CurInst)) { + CallSite CS(CurInst); + + if (IntrinsicInst *II = dyn_cast(CurInst)) + processInvariantIntrinsics(II); + + // Debug info, inline asm, intrinsics, ... + // can safely be ignored here. 
+ if (isa(CS.getInstruction()) || + isa(CS.getCalledValue()) || + dyn_cast(CS.getInstruction())) { + ++CurInst; + continue; + } + Function *Callee = + dyn_cast_or_null(Eval.getVal(CS.getCalledValue(), + /*CheckComputed =*/ false)); + if (!Callee || Callee->mayBeOverridden()) + break; + + if (!Callee->isDeclaration() && + !Callee->getFunctionType()->isVarArg()) { + processInvariantIntrinsics(Eval, Callee); + } + } else if (isa(CurInst)) { + if (BranchInst *BI = dyn_cast(CurInst)) { + if (BI->isUnconditional()) + NextBB = BI->getSuccessor(0); + else { + if (ConstantInt *Cond = + dyn_cast_or_null(Eval.getVal(BI->getCondition(), + /*CheckComputed =*/ false))) + NextBB = BI->getSuccessor(!Cond->getZExtValue()); + } + } else if (SwitchInst *SI = dyn_cast(CurInst)) { + if(ConstantInt *Val = + dyn_cast_or_null( + Eval.getVal(SI->getCondition(), /*CheckComputed =*/ false))) + NextBB = SI->findCaseValue(Val).getCaseSuccessor(); + } else if (IndirectBrInst *IBI = dyn_cast(CurInst)) { + Value *Val = + Eval.getVal(IBI->getAddress(), + /*CheckComputed =*/ false)->stripPointerCasts(); + if (BlockAddress *BA = dyn_cast_or_null(Val)) + NextBB = BA->getBasicBlock(); + NextBB = nullptr; + } else if (isa(CurInst)) + NextBB = nullptr; + break; + } + + if (InvokeInst *II = dyn_cast(CurInst)) { + NextBB = II->getNormalDest(); + break; + } + + ++CurInst; + } + + BB = NextBB; + } +} + /// EvaluateFunction - Evaluate a call to function F, returning true if /// successful, false if we can't evaluate it. ActualArgs contains the formal /// arguments for the function. @@ -2647,6 +2741,10 @@ BasicBlock::iterator CurInst = CurBB->begin(); + // Scan the block to process invariant intrinsics. + // This will mark 'writeonce' global variables as written. + processInvariantIntrinsics(*this, F); + while (1) { BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings. DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n"); @@ -2706,6 +2804,11 @@ CommitValueTo(I->second, I->first); for (GlobalVariable *GV : Eval.getInvariants()) GV->setConstant(true); + + for (GlobalVariable *GV : Eval.getReadOnlys()) { + assert(GV->getInvariantStartInstruction() && + "Only readonly writeonce global vars are allowed here."); + } } return EvalSuccess; Index: lib/Transforms/IPO/Inliner.cpp =================================================================== --- lib/Transforms/IPO/Inliner.cpp +++ lib/Transforms/IPO/Inliner.cpp @@ -470,9 +470,10 @@ for (Instruction &I : BB) { CallSite CS(cast(&I)); // If this isn't a call, or it is a call to an intrinsic, it can - // never be inlined. + // never be inlined. invariant_start/end intrinsics are excepted + // because they should be processed when inlining other calls. if (!CS || isa(I)) - continue; + continue; // If this is a direct call to an external function, we can never inline // it. If it is an indirect call, inlining may resolve it to be a @@ -514,7 +515,7 @@ // CallSites may be modified inside so ranged for loop can not be used. 
for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) { CallSite CS = CallSites[CSi].first; - + Function *Caller = CS.getCaller(); Function *Callee = CS.getCalledFunction(); Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1441,6 +1441,10 @@ break; } + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + processInvariantIntrinsics(II); + break; case Intrinsic::experimental_gc_relocate: { // Translate facts known about a pointer before relocating into // facts about the relocate value, while being careful to Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1957,6 +1957,17 @@ uint64_t DontKnow = CI->isZero() ? -1ULL : 0; ReplaceInstUsesWith(*I, ConstantInt::get(I->getType(), DontKnow)); } + + // If this is a paired invariant_start, then erase its invariant_end. + if (II->getIntrinsicID() == Intrinsic::invariant_start && + !I->use_empty()) { + IntrinsicInst *User = + dyn_cast(cast(*I->user_begin())); + assert(I->hasOneUse() && User && + User->getIntrinsicID() == Intrinsic::invariant_end && + "The paired instruction should be an invariant_end."); + EraseInstFromFunction(*User); + } } EraseInstFromFunction(*I); } Index: lib/Transforms/Scalar/GVN.cpp =================================================================== --- lib/Transforms/Scalar/GVN.cpp +++ lib/Transforms/Scalar/GVN.cpp @@ -2280,6 +2280,11 @@ if (isa(I)) return false; + if (IntrinsicInst *IntrinsicI = dyn_cast(I)) { + if (processInvariantIntrinsics(IntrinsicI)) + return false; + } + // If the instruction can be easily simplified then do so now in preference // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction Index: test/Transforms/LoadElim/global-local-vars.ll =================================================================== --- /dev/null +++ test/Transforms/LoadElim/global-local-vars.ll @@ -0,0 +1,343 @@ + +;; NOTE: The CHECKLOAD-* prefixes indicate occurences of redundant loads in the output. +;; The CHECK-* prefixes indicate removal of redundant loads in the output. (ALL == 4-5A1-5A2-5B) + +;; * When the available load scan limit is 6, -instcombine does not +;; eliminate some redundant loads that either it would eliminate +;; with a load scan limit of 8, or -gvn would eliminate. +; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-5B --check-prefix=CHECKLOAD-4-5A +; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-4-5A1-5B --check-prefix=CHECKLOAD-5A2 +; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1 +; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL + +;; * Adding '-inline -early-cse' enables a few more load eliminations, +;; but does not merge the same loads into the store as per local vars. 
+; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1 +; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-ALL +; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1 +; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-ALL + +;; * When the load scan limit is 8, +;; '-functionattrs -tailcallelim -instcombine' may be as good as '-gvn'. +;; But the same can't be said when the limit is 6. +; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL +; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-4-5A-5B1 --check-prefix=CHECKLOAD-5B2 + +%struct.A = type { i32 } + +@_ZL1i = internal global %struct.A zeroinitializer +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_global_local, i8* null }] + +;; Example 2: Unnecessary stores and loads. +;; void ex2() { +;; const Type i(one()); +;; const Type j = i; // Note: i == j, &i != &j ==> No store. +;; bar(i); // First load. +;; foo(&i); // Does not change i, nor j. +;; bar(j); // No load; Reuse i location. +;; } +define void @_Z3ex2v() { +; CHECK: @_Z3ex2v( +entry: + %j = alloca %struct.A + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %0 = bitcast %struct.A* %j to i8* + call void @llvm.lifetime.start(i64 4, i8* %0) + %1 = bitcast %struct.A* %j to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %2 = bitcast %struct.A* %j to i8* + %3 = call {}* @llvm.invariant.start(i64 4, i8* %2) + ; CHECK-NOT: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8* + %4 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %5 = load i32, i32* %coerce.dive + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %5) + call void @_Z3fooPK1A(%struct.A* @_ZL1i) + %6 = bitcast %struct.A* %agg.tmp1 to i8* + %7 = bitcast %struct.A* %j to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %8 = load i32, i32* %coerce.dive2 + ; CHECK-NOT: load i32, i32* + call void @_Z3bar1A(i32 %8) + call void @llvm.invariant.end({}* %3, i64 4, i8* %2) + ; CHECK-NOT: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8* + %9 = bitcast %struct.A* %j to i8* + call void @llvm.lifetime.end(i64 4, i8* %9) + ret void +} + +;; Example 3: Necessary stores and loads. +;; void ex3() { +;; const Type i(1); +;; Type k = i; // Note: i == k, &i != &k ==> Keep store. +;; bar(i); // First load. +;; foo(&k); // Does not change i; May change k. 
+;; bar(k); // Keep load. +;; } +define void @_Z3ex3v() { +; CHECK: @_Z3ex3v( +entry: + %k = alloca %struct.A + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %0 = bitcast %struct.A* %k to i8* + call void @llvm.lifetime.start(i64 4, i8* %0) + %1 = bitcast %struct.A* %k to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %2 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %3 = load i32, i32* %coerce.dive + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %3) + call void @_Z3fooPK1A(%struct.A* %k) + %4 = bitcast %struct.A* %agg.tmp1 to i8* + %5 = bitcast %struct.A* %k to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %6 = load i32, i32* %coerce.dive2 + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %6) + %7 = bitcast %struct.A* %k to i8* + call void @llvm.lifetime.end(i64 4, i8* %7) + ret void +} + +;; Example 4: Smart stores and loads. +;; void ex4() { +;; const Type i(one()); +;; Type k = i; // Note: i == k, &i != &k ==> May keep store. +;; bar(i); // First load. +;; foo(&i); // Does not change i, nor k. +;; bar(k); // No load; Reuse i location. +;; foo(&k); // Does not change i; May change k. +;; bar(k); // Keep load. +;; } +define void @_Z3ex4v() { +; CHECK: @_Z3ex4v( +entry: + %k = alloca %struct.A + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %agg.tmp3 = alloca %struct.A + %0 = bitcast %struct.A* %k to i8* + call void @llvm.lifetime.start(i64 4, i8* %0) + %1 = bitcast %struct.A* %k to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %2 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %3 = load i32, i32* %coerce.dive + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %3) + call void @_Z3fooPK1A(%struct.A* @_ZL1i) + %4 = bitcast %struct.A* %agg.tmp1 to i8* + %5 = bitcast %struct.A* %k to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %6 = load i32, i32* %coerce.dive2 + ; CHECKLOAD-4-5A1: load i32, i32* + ; CHECKLOAD-4-5A: load i32, i32* + ; CHECK-4-5A1-5B-NOT: load i32, i32* + ; CHECK-4-5A-5B1-NOT: load i32, i32* + ; CHECK-ALL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %6) + call void @_Z3fooPK1A(%struct.A* %k) + %7 = bitcast %struct.A* %agg.tmp3 to i8* + %8 = bitcast %struct.A* %k to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %8, i64 4, i32 4, i1 false) + %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0 + %9 = load i32, i32* %coerce.dive4 + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %9) + %10 = bitcast %struct.A* %k to i8* + call void @llvm.lifetime.end(i64 4, i8* %10) + ret void +} + +;; Example 5: Duplicate and smart loads (and stores). +;; void ex5a() { +;; const Type i(one()); +;; Type k = i; // Note: i == k, &i != &k ==> May keep store. +;; bar(i); // First load. +;; bar(k); // No load; Reuse i location. 
+;; foo2(&k, &i); // Does not change i; May change k. +;; bar(i); // No load. +;; bar(k); // Keep load. +;; } +define void @_Z4ex5av() { +; CHECK: @_Z4ex5av( +entry: + %k = alloca %struct.A + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %agg.tmp3 = alloca %struct.A + %agg.tmp5 = alloca %struct.A + %0 = bitcast %struct.A* %k to i8* + call void @llvm.lifetime.start(i64 4, i8* %0) + %1 = bitcast %struct.A* %k to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %2 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %3 = load i32, i32* %coerce.dive + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %3) + %4 = bitcast %struct.A* %agg.tmp1 to i8* + %5 = bitcast %struct.A* %k to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %6 = load i32, i32* %coerce.dive2 + ; CHECKLOAD-4-5A1: load i32, i32* + ; CHECKLOAD-4-5A: load i32, i32* + ; CHECK-4-5A1-5B-NOT: load i32, i32* + ; CHECK-4-5A-5B1-NOT: load i32, i32* + ; CHECK-ALL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %6) + call void @_Z4foo2PK1AS1_(%struct.A* %k, %struct.A* @_ZL1i) + %7 = bitcast %struct.A* %agg.tmp3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0 + %8 = load i32, i32* %coerce.dive4 + ; CHECKLOAD-4-5A: load i32, i32* + ; CHECKLOAD-5A2: load i32, i32* + ; CHECK-4-5A-5B1-NOT: load i32, i32* + ; CHECK-5A2-5B-NOT: load i32, i32* + ; CHECK-ALL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %8) + %9 = bitcast %struct.A* %agg.tmp5 to i8* + %10 = bitcast %struct.A* %k to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* %10, i64 4, i32 4, i1 false) + %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0 + %11 = load i32, i32* %coerce.dive6 + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %11) + %12 = bitcast %struct.A* %k to i8* + call void @llvm.lifetime.end(i64 4, i8* %12) + ret void +} + +;; Example 5: Duplicate and smart loads (and stores). +;; void ex5b() { +;; const Type i(one()); +;; const Type j = i; // Note: i == j, &i != &j ==> No store. +;; bar(i); // First load. +;; bar(j); // No load; Reuse i location. +;; foo2(&j, &i); // Does not change i, nor j. +;; bar(i); // No load. +;; bar(j); // No load; Reuse i location. 
+;; } +define void @_Z4ex5bv() { +; CHECK: @_Z4ex5bv( +entry: + %j = alloca %struct.A + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %agg.tmp3 = alloca %struct.A + %agg.tmp5 = alloca %struct.A + %0 = bitcast %struct.A* %j to i8* + call void @llvm.lifetime.start(i64 4, i8* %0) + %1 = bitcast %struct.A* %j to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %2 = bitcast %struct.A* %j to i8* + ; CHECK-4-5A1-5B: load i32, i32* + ; CHECK-5A2-5B: load i32, i32* + ; CHECK-ALL: load i32, i32* + %3 = call {}* @llvm.invariant.start(i64 4, i8* %2) + ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8* + %4 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %5 = load i32, i32* %coerce.dive + ; CHECK-5B: load i32, i32* + call void @_Z3bar1A(i32 %5) + %6 = bitcast %struct.A* %agg.tmp1 to i8* + %7 = bitcast %struct.A* %j to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %8 = load i32, i32* %coerce.dive2 + ; CHECK-NOT: load i32, i32* + call void @_Z3bar1A(i32 %8) + call void @_Z4foo2PK1AS1_(%struct.A* %j, %struct.A* @_ZL1i) + %9 = bitcast %struct.A* %agg.tmp3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0 + %10 = load i32, i32* %coerce.dive4 + ; CHECK-4-5A1-5B-NOT: load i32, i32* + ; CHECK-4-5A-5B1-NOT: load i32, i32* + ; CHECK-5A2-5B-NOT: load i32, i32* + ; CHECK-5B-NOT: load i32, i32* + ; CHECK-ALL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %10) + %11 = bitcast %struct.A* %agg.tmp5 to i8* + %12 = bitcast %struct.A* %j to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %11, i8* %12, i64 4, i32 4, i1 false) + %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0 + %13 = load i32, i32* %coerce.dive6 + ; CHECKLOAD-5B2: load i32, i32* + ; CHECK-4-5A1-5B-NOT: load i32, i32* + ; CHECK-5A2-5B-NOT: load i32, i32* + ; CHECK-5B-NOT: load i32, i32* + ; CHECK-ALL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %13) + call void @llvm.invariant.end({}* %3, i64 4, i8* %2) + ; CHECK: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8* + %14 = bitcast %struct.A* %j to i8* + call void @llvm.lifetime.end(i64 4, i8* %14) + ret void +} + +define internal void @__cxx_global_var_init() { +entry: + %call = call i32 @_Z3onev() + call void @_ZN1AC1Ei(%struct.A* @_ZL1i, i32 %call) + ; CHECKINL: store i32 {{.*}}, i32* + %0 = call {}* @llvm.invariant.start(i64 4, i8* bitcast (%struct.A* @_ZL1i to i8*)) + ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8* + ret void +} + +declare i32 @_Z3onev() +declare void @_Z3bar1A(i32) +declare void @_Z3fooPK1A(%struct.A*) +declare void @_Z4foo2PK1AS1_(%struct.A*, %struct.A*) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) +declare {}* @llvm.invariant.start(i64, i8* nocapture) +declare void @llvm.invariant.end({}*, i64, i8* nocapture) +declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.end(i64, i8* nocapture) + +define internal void @_GLOBAL__sub_I_global_local() { +entry: + call void @__cxx_global_var_init() + ret void +} 
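Note for reviewers (not part of the test): the ;; comments above give the bodies of ex2..ex5b, but not the surrounding declarations. Roughly the C++ this file models, reconstructed from the mangled names and the IR, is sketched below; the field name 'a' and the exact signatures are informed guesses, not shipped source.

// Illustrative source only, reconstructed from the IR.
struct A { int a; A(int x) : a(x) {} };   // %struct.A = type { i32 }

int  one();                               // _Z3onev
void bar(A);                              // _Z3bar1A (takes A by value)
void foo(const A *);                      // _Z3fooPK1A
void foo2(const A *, const A *);          // _Z4foo2PK1AS1_

// @_ZL1i: a file-static const object with a dynamic initializer. Its
// constructor runs in __cxx_global_var_init(), which then calls
// llvm.invariant.start over the object, making it "writeonce readonly".
static const A i(one());

Because @_ZL1i has internal linkage and its llvm.invariant.start is emitted right after construction, -globalopt can record the marker on the GlobalVariable itself, which is what lets BasicAA treat later loads from it as loads from constant memory in the runs below.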
+ +define linkonce_odr void @_ZN1AC1Ei(%struct.A* %this, i32 %a) { +entry: + %this.addr = alloca %struct.A* + %a.addr = alloca i32 + store %struct.A* %this, %struct.A** %this.addr + store i32 %a, i32* %a.addr + %this1 = load %struct.A*, %struct.A** %this.addr + %0 = load i32, i32* %a.addr + call void @_ZN1AC2Ei(%struct.A* %this1, i32 %0) + ret void +} + +define linkonce_odr void @_ZN1AC2Ei(%struct.A* %this, i32 %a) { +entry: + %this.addr = alloca %struct.A* + %a.addr = alloca i32 + store %struct.A* %this, %struct.A** %this.addr + store i32 %a, i32* %a.addr + %this1 = load %struct.A*, %struct.A** %this.addr + %a2 = getelementptr inbounds %struct.A, %struct.A* %this1, i32 0, i32 0 + %0 = load i32, i32* %a.addr + store i32 %0, i32* %a2 + ret void +} Index: test/Transforms/LoadElim/global-vars.ll =================================================================== --- /dev/null +++ test/Transforms/LoadElim/global-vars.ll @@ -0,0 +1,327 @@ + +;; NOTE: The CHECKLOAD-* prefixes indicate occurences of redundant loads in the output, +;; expected when the loads are not from global variables. +;; The CHECK-* prefixes indicate removal of redundant loads in the output, +;; expected when the loads are not from global variables. +;; (ALL == 4-5A1-5A2-5B) + +;; * When the available load scan limit is 6, -instcombine does not +;; eliminate some redundant loads that either it would eliminate +;; with a load scan limit of 8, or -gvn would eliminate. +; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1 +; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL +; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1 +; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL + +;; * Adding '-inline -early-cse' enables a few more load eliminations, +;; but does not merge the same loads into the store as per local vars. +; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1 +; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-ALL +; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1 +; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECK-ALL + +;; * When the load scan limit is 8, +;; '-functionattrs -tailcallelim -instcombine' may be as good as '-gvn'. +;; But the same can't be said when the limit is 6. 
+; RUN: opt < %s -globalopt -available-load-scan-limit=8 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ALL +; RUN: opt < %s -globalopt -available-load-scan-limit=6 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-4-5A-5B1 --check-prefix=CHECKLOAD-5B2 + +%struct.A = type { i32 } + +@_ZL1i = internal global %struct.A zeroinitializer +@k = global %struct.A zeroinitializer +@_ZL1j = internal global %struct.A zeroinitializer +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_global, i8* null }] + +;; Example 1: Duplicate loads. +;; void ex1() { +;; const Type i(one()); +;; bar(i); // First load. +;; foo(&i); // Does not change i. +;; bar(i); // No load. +;; } +define void @_Z3ex1v() { +; CHECK: @_Z3ex1v( +entry: + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %0 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %1 = load i32, i32* %coerce.dive + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %1) + call void @_Z3fooPK1A(%struct.A* @_ZL1i) + %2 = bitcast %struct.A* %agg.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %3 = load i32, i32* %coerce.dive2 + ; CHECK-NOT: load i32, i32* + call void @_Z3bar1A(i32 %3) + ret void +} + +;; Example 2: Unnecessary stores and loads. +;; void ex2() { +;; const Type i(one()); +;; const Type j = i; // Note: i == j, &i != &j ==> No store. +;; bar(i); // First load. +;; foo(&i); // Does not change i, nor j. +;; bar(j); // No load; Reuse i location. +;; } +define void @_Z3ex2v() { +; CHECK: @_Z3ex2v( +entry: + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %0 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %1 = load i32, i32* %coerce.dive + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %1) + call void @_Z3fooPK1A(%struct.A* @_ZL1i) + %2 = bitcast %struct.A* %agg.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1j to i8*), i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %3 = load i32, i32* %coerce.dive2 + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %3) + ret void +} + +;; Example 3: Necessary stores and loads. +;; void ex3() { +;; const Type i(1); +;; Type k = i; // Note: i == k, &i != &k ==> Keep store. +;; bar(i); // First load. +;; foo(&k); // Does not change i; May change k. +;; bar(k); // Keep load. 
+;; } +define void @_Z3ex3v() { +; CHECK: @_Z3ex3v( +entry: + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %0 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %1 = load i32, i32* %coerce.dive + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %1) + call void @_Z3fooPK1A(%struct.A* @k) + %2 = bitcast %struct.A* %agg.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %3 = load i32, i32* %coerce.dive2 + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %3) + ret void +} + +;; Example 4: Smart stores and loads. +;; void ex4() { +;; const Type i(one()); +;; Type k = i; // Note: i == k, &i != &k ==> May keep store. +;; bar(i); // First load. +;; foo(&i); // Does not change i, nor k. +;; bar(k); // No load; Reuse i location. +;; foo(&k); // Does not change i; May change k. +;; bar(k); // Keep load. +;; } +define void @_Z3ex4v() { +; CHECK: @_Z3ex4v( +entry: + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %agg.tmp3 = alloca %struct.A + %0 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %1 = load i32, i32* %coerce.dive + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %1) + call void @_Z3fooPK1A(%struct.A* @_ZL1i) + %2 = bitcast %struct.A* %agg.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %3 = load i32, i32* %coerce.dive2 + ; CHECKLOAD-4-5A1: load i32, i32* + ; CHECK-4-5A-5B1: load i32, i32* + ; CHECK-ALL: load i32, i32* + call void @_Z3bar1A(i32 %3) + call void @_Z3fooPK1A(%struct.A* @k) + %4 = bitcast %struct.A* %agg.tmp3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false) + %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0 + %5 = load i32, i32* %coerce.dive4 + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %5) + ret void +} + +;; Example 5: Duplicate and smart loads (and stores). +;; void ex5a() { +;; const Type i(one()); +;; Type k = i; // Note: i == k, &i != &k ==> May keep store. +;; bar(i); // First load. +;; bar(k); // No load; Reuse i location. +;; foo2(&k, &i); // Does not change i; May change k. +;; bar(i); // No load. +;; bar(k); // Keep load. 
+;; } +define void @_Z4ex5av() { +; CHECK: @_Z4ex5av( +entry: + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %agg.tmp3 = alloca %struct.A + %agg.tmp5 = alloca %struct.A + %0 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %1 = load i32, i32* %coerce.dive + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %1) + %2 = bitcast %struct.A* %agg.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %3 = load i32, i32* %coerce.dive2 + ; CHECKLOAD-4-5A1: load i32, i32* + ; CHECK-4-5A-5B1: load i32, i32* + ; CHECK-ALL: load i32, i32* + call void @_Z3bar1A(i32 %3) + call void @_Z4foo2PK1AS1_(%struct.A* @k, %struct.A* @_ZL1i) + %4 = bitcast %struct.A* %agg.tmp3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0 + %5 = load i32, i32* %coerce.dive4 + ; CHECK-4-5A-5B1-NOT: load i32, i32* + ; CHECK-5A2-5B-NOT: load i32, i32* + ; CHECK-ALL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %5) + %6 = bitcast %struct.A* %agg.tmp5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.A* @k to i8*), i64 4, i32 4, i1 false) + %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0 + %7 = load i32, i32* %coerce.dive6 + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %7) + ret void +} + +;; Example 5: Duplicate and smart loads (and stores). +;; void ex5b() { +;; const Type i(one()); +;; const Type j = i; // Note: i == j, &i != &j ==> No store. +;; bar(i); // First load. +;; bar(j); // No load; Reuse i location. +;; foo2(&j, &i); // Does not change i, nor j. +;; bar(i); // No load. +;; bar(j); // No load; Reuse i location. 
+;; } +define void @_Z4ex5bv() { +; CHECK: @_Z4ex5bv( +entry: + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %agg.tmp3 = alloca %struct.A + %agg.tmp5 = alloca %struct.A + %0 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %1 = load i32, i32* %coerce.dive + ; CHECK-5A2-5B: load i32, i32* + ; CHECK-ALL: load i32, i32* + call void @_Z3bar1A(i32 %1) + %2 = bitcast %struct.A* %agg.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.A* @_ZL1j to i8*), i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %3 = load i32, i32* %coerce.dive2 + ; CHECK: load i32, i32* + call void @_Z3bar1A(i32 %3) + call void @_Z4foo2PK1AS1_(%struct.A* @_ZL1j, %struct.A* @_ZL1i) + %4 = bitcast %struct.A* %agg.tmp3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0 + %5 = load i32, i32* %coerce.dive4 + ; CHECK-4-5A-5B1-NOT: load i32, i32* + ; CHECK-5A2-5B-NOT: load i32, i32* + ; CHECK-ALL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %5) + %6 = bitcast %struct.A* %agg.tmp5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.A* @_ZL1j to i8*), i64 4, i32 4, i1 false) + %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0 + %7 = load i32, i32* %coerce.dive6 + ; CHECKLOAD-5B2: load i32, i32* + ; CHECK-5A2-5B-NOT: load i32, i32* + ; CHECK-ALL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %7) + ret void +} + +define internal void @__cxx_global_var_init() { +entry: + %call = call i32 @_Z3onev() + call void @_ZN1AC1Ei(%struct.A* @_ZL1i, i32 %call) + ; CHECKINL: store i32 {{.*}}, i32* + %0 = call {}* @llvm.invariant.start(i64 4, i8* bitcast (%struct.A* @_ZL1i to i8*)) + ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8* + ret void +} + +define internal void @__cxx_global_var_init.2() { +entry: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.A* @_ZL1j to i8*), i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + %0 = call {}* @llvm.invariant.start(i64 4, i8* bitcast (%struct.A* @_ZL1j to i8*)) + ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8* + ret void +} + +declare i32 @_Z3onev() +declare void @_Z3bar1A(i32) +declare void @_Z3fooPK1A(%struct.A*) +declare void @_Z4foo2PK1AS1_(%struct.A*, %struct.A*) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) +declare {}* @llvm.invariant.start(i64, i8* nocapture) +declare void @llvm.invariant.end({}*, i64, i8* nocapture) +declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.end(i64, i8* nocapture) + +define internal void @_GLOBAL__sub_I_global() { +entry: + call void @__cxx_global_var_init() + call void @__cxx_global_var_init.2() + call void @__cxx_global_var_init.1() + ret void +} + +define internal void @__cxx_global_var_init.1() { +entry: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.A* @k to i8*), i8* bitcast (%struct.A* @_ZL1i to i8*), i64 4, i32 4, i1 false) + ret void +} + +define linkonce_odr void @_ZN1AC1Ei(%struct.A* %this, i32 %a) { +entry: + %this.addr = alloca %struct.A* + %a.addr = alloca i32 + store %struct.A* %this, %struct.A** %this.addr + store 
i32 %a, i32* %a.addr + %this1 = load %struct.A*, %struct.A** %this.addr + %0 = load i32, i32* %a.addr + call void @_ZN1AC2Ei(%struct.A* %this1, i32 %0) + ret void +} + +define linkonce_odr void @_ZN1AC2Ei(%struct.A* %this, i32 %a) { +entry: + %this.addr = alloca %struct.A* + %a.addr = alloca i32 + store %struct.A* %this, %struct.A** %this.addr + store i32 %a, i32* %a.addr + %this1 = load %struct.A*, %struct.A** %this.addr + %a2 = getelementptr inbounds %struct.A, %struct.A* %this1, i32 0, i32 0 + %0 = load i32, i32* %a.addr + store i32 %0, i32* %a2 + ret void +} Index: test/Transforms/LoadElim/local-vars.ll =================================================================== --- /dev/null +++ test/Transforms/LoadElim/local-vars.ll @@ -0,0 +1,437 @@ + +;; NOTE: The CHECKLOAD-* prefixes indicate occurences of redundant loads in the output. +;; The CHECK-* prefixes indicate removal of redundant loads in the output. (ALL == 4-5A1-5A2-5B) + +;; * When the available load scan limit is 6, -instcombine does not +;; eliminate some redundant loads that either it would eliminate +;; with a load scan limit of 8, or -gvn would eliminate. +; RUN: opt < %s -available-load-scan-limit=6 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKNOINL --check-prefix=CHECKLOAD-ALL +; RUN: opt < %s -available-load-scan-limit=6 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKNOINL --check-prefix=CHECK-4-5A1-5B2 --check-prefix=CHECKLOAD-5A2-5B1 +; RUN: opt < %s -available-load-scan-limit=8 -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKNOINL --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1 +; RUN: opt < %s -available-load-scan-limit=8 -instcombine -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKNOINL --check-prefix=CHECK-ALL + +;; * Adding '-inline -early-cse' enables a few more load eliminations. +; RUN: opt < %s -available-load-scan-limit=6 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECKINV --check-prefix=CHECK-5B1 --check-prefix=CHECKLOAD-4-5A-5B2 +; RUN: opt < %s -available-load-scan-limit=6 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECKINV3 --check-prefix=CHECK-ALL +; RUN: opt < %s -available-load-scan-limit=8 -instcombine -inline -early-cse -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECKINV --check-prefix=CHECK-5A2-5B --check-prefix=CHECKLOAD-4-5A1 +; RUN: opt < %s -available-load-scan-limit=8 -instcombine -inline -early-cse -gvn -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKINL --check-prefix=CHECKINV3 --check-prefix=CHECK-ALL + +;; * When the load scan limit is 8, +;; '-functionattrs -tailcallelim -instcombine' may be as good as '-gvn'. +;; But the same can't be said when the limit is 6. +; RUN: opt < %s -available-load-scan-limit=8 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKNOINL --check-prefix=CHECK-ALL +; RUN: opt < %s -available-load-scan-limit=6 -instcombine -functionattrs -tailcallelim -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKNOINL --check-prefix=CHECK-5A-5B1 --check-prefix=CHECKLOAD-4-5B2 + + +%struct.A = type { i32 } + +;; Example 1: Duplicate loads. +;; void ex1() { +;; const Type i(one()); +;; bar(i); // First load. +;; foo(&i); // Does not change i. +;; bar(i); // No load. 
+;; } +define void @_Z3ex1v() { +entry: + %i = alloca %struct.A + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %0 = bitcast %struct.A* %i to i8* + call void @llvm.lifetime.start(i64 4, i8* %0) + %call = call i32 @_Z3onev() + call void @_ZN1AC2Ei(%struct.A* %i, i32 %call) + ; CHECKINL: store i32 {{.*}}, i32* + %1 = bitcast %struct.A* %i to i8* + %2 = call {}* @llvm.invariant.start(i64 4, i8* %1) + ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8* + %3 = bitcast %struct.A* %agg.tmp to i8* + %4 = bitcast %struct.A* %i to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* %4, i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %5 = load i32, i32* %coerce.dive + ; CHECKNOINL: load i32, i32* + ; CHECKINL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %5) + call void @_Z3fooPK1A(%struct.A* %i) + %6 = bitcast %struct.A* %agg.tmp1 to i8* + %7 = bitcast %struct.A* %i to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %8 = load i32, i32* %coerce.dive2 + ; CHECK-NOT: load i32, i32* + call void @_Z3bar1A(i32 %8) + call void @llvm.invariant.end({}* %2, i64 4, i8* %1) + ; CHECK: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8* + %9 = bitcast %struct.A* %i to i8* + call void @llvm.lifetime.end(i64 4, i8* %9) + ret void +} + +;; Example 2: Unnecessary stores and loads. +;; void ex2() { +;; const Type i(one()); +;; const Type j = i; // Note: i == j, &i != &j ==> No store. +;; bar(i); // First load. +;; foo(&i); // Does not change i, nor j. +;; bar(j); // No load; Reuse i location. +;; } +define void @_Z3ex2v() { +entry: + %i = alloca %struct.A + %j = alloca %struct.A + %agg.tmp = alloca %struct.A + %agg.tmp1 = alloca %struct.A + %0 = bitcast %struct.A* %i to i8* + call void @llvm.lifetime.start(i64 4, i8* %0) + %call = call i32 @_Z3onev() + call void @_ZN1AC2Ei(%struct.A* %i, i32 %call) + ; CHECKINL: store i32 {{.*}}, i32* + %1 = bitcast %struct.A* %i to i8* + %2 = call {}* @llvm.invariant.start(i64 4, i8* %1) + ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8* + %3 = bitcast %struct.A* %j to i8* + call void @llvm.lifetime.start(i64 4, i8* %3) + %4 = bitcast %struct.A* %j to i8* + %5 = bitcast %struct.A* %i to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false) + %6 = bitcast %struct.A* %j to i8* + %7 = call {}* @llvm.invariant.start(i64 4, i8* %6) + ; CHECK-NOT: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8* + %8 = bitcast %struct.A* %agg.tmp to i8* + %9 = bitcast %struct.A* %i to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %8, i8* %9, i64 4, i32 4, i1 false) + %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0 + %10 = load i32, i32* %coerce.dive + ; CHECKNOINL: load i32, i32* + ; CHECKINL-NOT: load i32, i32* + call void @_Z3bar1A(i32 %10) + call void @_Z3fooPK1A(%struct.A* %i) + %11 = bitcast %struct.A* %agg.tmp1 to i8* + %12 = bitcast %struct.A* %j to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %11, i8* %12, i64 4, i32 4, i1 false) + %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0 + %13 = load i32, i32* %coerce.dive2 + ; CHECK-NOT: load i32, i32* + call void @_Z3bar1A(i32 %13) + call void @llvm.invariant.end({}* %7, i64 4, i8* %6) + ; CHECK-NOT: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8* + %14 = bitcast %struct.A* %j to i8* + call void 
+  call void @llvm.invariant.end({}* %2, i64 4, i8* %1)
+  ; CHECK: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  %15 = bitcast %struct.A* %i to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %15)
+  ret void
+}
+
+;; Example 3: Necessary stores and loads.
+;; void ex3() {
+;; const Type i(1);
+;; Type k = i; // Note: i == k, &i != &k ==> Keep store.
+;; bar(i); // First load.
+;; foo(&k); // Does not change i; May change k.
+;; bar(k); // Keep load.
+;; }
+define void @_Z3ex3v() {
+entry:
+  %i = alloca %struct.A
+  %k = alloca %struct.A
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %0 = bitcast %struct.A* %i to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0)
+  %call = call i32 @_Z3onev()
+  call void @_ZN1AC2Ei(%struct.A* %i, i32 %call)
+  ; CHECKINL: store i32 {{.*}}, i32*
+  %1 = bitcast %struct.A* %i to i8*
+  %2 = call {}* @llvm.invariant.start(i64 4, i8* %1)
+  ; CHECKNOINL: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  ; CHECKINV3: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  ; CHECKINV-NOT: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  %3 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %3)
+  %4 = bitcast %struct.A* %k to i8*
+  %5 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false)
+  %6 = bitcast %struct.A* %agg.tmp to i8*
+  %7 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %8 = load i32, i32* %coerce.dive
+  ; CHECKNOINL: load i32, i32*
+  ; CHECKINL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %8)
+  call void @_Z3fooPK1A(%struct.A* %k)
+  %9 = bitcast %struct.A* %agg.tmp1 to i8*
+  %10 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* %10, i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %11 = load i32, i32* %coerce.dive2
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %11)
+  %12 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %12)
+  call void @llvm.invariant.end({}* %2, i64 4, i8* %1)
+  ; CHEC K-ALL: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8* ; FIXME:
+  ; CHECKNOINL: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  ; CHECKINV3: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  ; CHECKINV-NOT: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  %13 = bitcast %struct.A* %i to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %13)
+  ret void
+}
+
+;; Example 4: Smart stores and loads.
+;; void ex4() {
+;; const Type i(one());
+;; Type k = i; // Note: i == k, &i != &k ==> May keep store.
+;; bar(i); // First load.
+;; foo(&i); // Does not change i, nor k.
+;; bar(k); // No load; Reuse i location.
+;; foo(&k); // Does not change i; May change k.
+;; bar(k); // Keep load.
+;; }
+define void @_Z3ex4v() {
+entry:
+  %i = alloca %struct.A
+  %k = alloca %struct.A
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %agg.tmp3 = alloca %struct.A
+  %0 = bitcast %struct.A* %i to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0)
+  %call = call i32 @_Z3onev()
+  call void @_ZN1AC2Ei(%struct.A* %i, i32 %call)
+  ; CHECKINL: store i32 {{.*}}, i32*
+  %1 = bitcast %struct.A* %i to i8*
+  %2 = call {}* @llvm.invariant.start(i64 4, i8* %1)
+  ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  %3 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %3)
+  %4 = bitcast %struct.A* %k to i8*
+  %5 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false)
+  %6 = bitcast %struct.A* %agg.tmp to i8*
+  %7 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %8 = load i32, i32* %coerce.dive
+  ; CHECKNOINL: load i32, i32*
+  ; CHECKINL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %8)
+  call void @_Z3fooPK1A(%struct.A* %i)
+  %9 = bitcast %struct.A* %agg.tmp1 to i8*
+  %10 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* %10, i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %11 = load i32, i32* %coerce.dive2
+  ; CHECKLOAD-4-5A1: load i32, i32*
+  ; CHECKLOAD-4-5B2: load i32, i32*
+  ; CHECKLOAD-4-5A-5B2: load i32, i32*
+  ; CHECKLOAD-ALL: load i32, i32*
+  ; CHECK-4-5A1-5B2-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %11)
+  call void @_Z3fooPK1A(%struct.A* %k)
+  %12 = bitcast %struct.A* %agg.tmp3 to i8*
+  %13 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %12, i8* %13, i64 4, i32 4, i1 false)
+  %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0
+  %14 = load i32, i32* %coerce.dive4
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %14)
+  %15 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %15)
+  call void @llvm.invariant.end({}* %2, i64 4, i8* %1)
+  ; CHECK: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  %16 = bitcast %struct.A* %i to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %16)
+  ret void
+}
+
+;; Example 5: Duplicate and smart loads (and stores).
+;; void ex5a() {
+;; const Type i(one());
+;; Type k = i; // Note: i == k, &i != &k ==> May keep store.
+;; bar(i); // First load.
+;; bar(k); // No load; Reuse i location.
+;; foo2(&k, &i); // Does not change i; May change k.
+;; bar(i); // No load.
+;; bar(k); // Keep load.
+;; }
+define void @_Z4ex5av() {
+entry:
+  %i = alloca %struct.A
+  %k = alloca %struct.A
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %agg.tmp3 = alloca %struct.A
+  %agg.tmp5 = alloca %struct.A
+  %0 = bitcast %struct.A* %i to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0)
+  %call = call i32 @_Z3onev()
+  call void @_ZN1AC2Ei(%struct.A* %i, i32 %call)
+  ; CHECKINL: store i32 {{.*}}, i32*
+  %1 = bitcast %struct.A* %i to i8*
+  %2 = call {}* @llvm.invariant.start(i64 4, i8* %1)
+  ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  %3 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %3)
+  %4 = bitcast %struct.A* %k to i8*
+  %5 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false)
+  %6 = bitcast %struct.A* %agg.tmp to i8*
+  %7 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %8 = load i32, i32* %coerce.dive
+  ; CHECKNOINL: load i32, i32*
+  ; CHECKINL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %8)
+  %9 = bitcast %struct.A* %agg.tmp1 to i8*
+  %10 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %9, i8* %10, i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %11 = load i32, i32* %coerce.dive2
+  ; CHECKLOAD-4-5A1: load i32, i32*
+  ; CHECKLOAD-4-5A-5B2: load i32, i32*
+  ; CHECK-5A-5B1-NOT: load i32, i32*
+  ; CHECK-4-5A1-5B2-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %11)
+  call void @_Z4foo2PK1AS1_(%struct.A* %k, %struct.A* %i)
+  %12 = bitcast %struct.A* %agg.tmp3 to i8*
+  %13 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %12, i8* %13, i64 4, i32 4, i1 false)
+  %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0
+  %14 = load i32, i32* %coerce.dive4
+  ; CHECKLOAD-4-5A-5B2: load i32, i32*
+  ; CHECKLOAD-5A2-5B1: load i32, i32*
+  ; CHECKLOAD-ALL: load i32, i32*
+  ; CHECK-5A-5B1-NOT: load i32, i32*
+  ; CHECK-5A2-NOT: load i32, i32*
+  ; CHECK-5A2-5B-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %14)
+  %15 = bitcast %struct.A* %agg.tmp5 to i8*
+  %16 = bitcast %struct.A* %k to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %15, i8* %16, i64 4, i32 4, i1 false)
+  %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0
+  %17 = load i32, i32* %coerce.dive6
+  ; CHECK: load i32, i32*
+  call void @_Z3bar1A(i32 %17)
+  %18 = bitcast %struct.A* %k to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %18)
+  call void @llvm.invariant.end({}* %2, i64 4, i8* %1)
+  ; CHECK: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  %19 = bitcast %struct.A* %i to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %19)
+  ret void
+}
+
+;; Example 5: Duplicate and smart loads (and stores).
+;; void ex5b() {
+;; const Type i(one());
+;; const Type j = i; // Note: i == j, &i != &j ==> No store.
+;; bar(i); // First load.
+;; bar(j); // No load; Reuse i location.
+;; foo2(&j, &i); // Does not change i, nor j.
+;; bar(i); // No load.
+;; bar(j); // No load; Reuse i location.
+;; }
+define void @_Z4ex5bv() {
+entry:
+  %i = alloca %struct.A
+  %j = alloca %struct.A
+  %agg.tmp = alloca %struct.A
+  %agg.tmp1 = alloca %struct.A
+  %agg.tmp3 = alloca %struct.A
+  %agg.tmp5 = alloca %struct.A
+  %0 = bitcast %struct.A* %i to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0)
+  %call = call i32 @_Z3onev()
+  call void @_ZN1AC2Ei(%struct.A* %i, i32 %call)
+  ; CHECKINL: store i32 {{.*}}, i32*
+  %1 = bitcast %struct.A* %i to i8*
+  %2 = call {}* @llvm.invariant.start(i64 4, i8* %1)
+  ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  %3 = bitcast %struct.A* %j to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %3)
+  %4 = bitcast %struct.A* %j to i8*
+  %5 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 4, i32 4, i1 false)
+  %6 = bitcast %struct.A* %j to i8*
+  %7 = call {}* @llvm.invariant.start(i64 4, i8* %6)
+  ; CHECK: call {{.*}}@llvm.invariant.start(i64 {{[0-9]+}}, i8*
+  %8 = bitcast %struct.A* %agg.tmp to i8*
+  %9 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %8, i8* %9, i64 4, i32 4, i1 false)
+  %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %agg.tmp, i32 0, i32 0
+  %10 = load i32, i32* %coerce.dive
+  ; CHECKNOINL-ALL: load i32, i32*
+  ; CHECKINL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %10)
+  %11 = bitcast %struct.A* %agg.tmp1 to i8*
+  %12 = bitcast %struct.A* %j to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %11, i8* %12, i64 4, i32 4, i1 false)
+  %coerce.dive2 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp1, i32 0, i32 0
+  %13 = load i32, i32* %coerce.dive2
+  ; CHECK-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %13)
+  call void @_Z4foo2PK1AS1_(%struct.A* %j, %struct.A* %i)
+  %14 = bitcast %struct.A* %agg.tmp3 to i8*
+  %15 = bitcast %struct.A* %i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %14, i8* %15, i64 4, i32 4, i1 false)
+  %coerce.dive4 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp3, i32 0, i32 0
+  %16 = load i32, i32* %coerce.dive4
+  ; CHECKLOAD-5A2-5B1: load i32, i32*
+  ; CHECKLOAD-ALL: load i32, i32*
+  ; CHECK-5A2-5B-NOT: load i32, i32*
+  ; CHECK-5A-5B1-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %16)
+  %17 = bitcast %struct.A* %agg.tmp5 to i8*
+  %18 = bitcast %struct.A* %j to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %17, i8* %18, i64 4, i32 4, i1 false)
+  %coerce.dive6 = getelementptr inbounds %struct.A, %struct.A* %agg.tmp5, i32 0, i32 0
+  %19 = load i32, i32* %coerce.dive6
+  ; CHECKLOAD-4-5B2: load i32, i32*
+  ; CHECKLOAD-4-5A-5B2: load i32, i32*
+  ; CHECKLOAD-ALL: load i32, i32*
+  ; CHECK-5A2-5B-NOT: load i32, i32*
+  ; CHECK-ALL-NOT: load i32, i32*
+  call void @_Z3bar1A(i32 %19)
+  call void @llvm.invariant.end({}* %7, i64 4, i8* %6)
+  ; CHECK: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  %20 = bitcast %struct.A* %j to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %20)
+  call void @llvm.invariant.end({}* %2, i64 4, i8* %1)
+  ; CHECK: call {{.*}}@llvm.invariant.end({{.*}}, i64 {{[0-9]+}}, i8*
+  %21 = bitcast %struct.A* %i to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %21)
+  ret void
+}
+
+declare i32 @_Z3onev()
+declare void @_Z3bar1A(i32)
+declare void @_Z3fooPK1A(%struct.A*)
+declare void @_Z4foo2PK1AS1_(%struct.A*, %struct.A*)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+declare {}* @llvm.invariant.start(i64, i8* nocapture)
+declare void @llvm.invariant.end({}*, i64, i8* nocapture)
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+define linkonce_odr void @_ZN1AC2Ei(%struct.A* %this, i32 %a) unnamed_addr {
+entry:
+  %this.addr = alloca %struct.A*
+  %a.addr = alloca i32
+  store %struct.A* %this, %struct.A** %this.addr
+  store i32 %a, i32* %a.addr
+  %this1 = load %struct.A*, %struct.A** %this.addr
+  %a2 = getelementptr inbounds %struct.A, %struct.A* %this1, i32 0, i32 0
+  %0 = load i32, i32* %a.addr
+  store i32 %0, i32* %a2
+  ret void
+}
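+
+;; For reference, a C++ harness consistent with the external declarations used by
+;; the ex* functions above; signatures are inferred from the mangled names and are
+;; illustrative only:
+;;   struct A { int a; A(int a); };    // @_ZN1AC2Ei
+;;   int one();                        // @_Z3onev
+;;   void bar(A);                      // @_Z3bar1A
+;;   void foo(const A *);              // @_Z3fooPK1A
+;;   void foo2(const A *, const A *);  // @_Z4foo2PK1AS1_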