diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index ae7d92163733..00c0e187c341 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1,4223 +1,4223 @@
//===- ValueTracking.cpp - Walk computations to compute properties --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains routines that help analyze properties that chains of
// computations have.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include <cstring>

using namespace llvm;
using namespace llvm::PatternMatch;

const unsigned MaxDepth = 6;

/// Enable an experimental feature to leverage information about dominating
/// conditions to compute known bits.  The individual options below control how
/// hard we search.  The defaults are chosen to be fairly aggressive.  If you
/// run into compile time problems when testing, scale them back and report
/// your findings.
static cl::opt<bool> EnableDomConditions("value-tracking-dom-conditions",
                                         cl::Hidden, cl::init(false));

// This is expensive, so we only do it for the top level query value.
// (TODO: evaluate cost vs profit, consider higher thresholds)
static cl::opt<unsigned> DomConditionsMaxDepth("dom-conditions-max-depth",
                                               cl::Hidden, cl::init(1));

/// How many dominating blocks should be scanned looking for dominating
/// conditions?
static cl::opt<unsigned> DomConditionsMaxDomBlocks("dom-conditions-dom-blocks",
                                                   cl::Hidden,
                                                   cl::init(20));

// Controls the number of uses of the value searched for possible
// dominating comparisons.
static cl::opt<unsigned> DomConditionsMaxUses("dom-conditions-max-uses",
                                              cl::Hidden, cl::init(20));

// If true, don't consider only compares whose only use is a branch.
static cl::opt<bool> DomConditionsSingleCmpUse("dom-conditions-single-cmp-use",
                                               cl::Hidden,
                                               cl::init(false));

/// Returns the bitwidth of the given scalar or pointer type (if unknown
/// returns 0).  For vector types, returns the element type's bitwidth.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
  if (unsigned BitWidth = Ty->getScalarSizeInBits())
    return BitWidth;

  return DL.getPointerTypeSizeInBits(Ty);
}

// Many of these functions have internal versions that take an assumption
// exclusion set.  This is because of the potential for mutual recursion to
// cause computeKnownBits to repeatedly visit the same assume intrinsic.
The // classic case of this is assume(x = y), which will attempt to determine // bits in x from bits in y, which will attempt to determine bits in y from // bits in x, etc. Regarding the mutual recursion, computeKnownBits can call // isKnownNonZero, which calls computeKnownBits and ComputeSignBit and // isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so on. typedef SmallPtrSet ExclInvsSet; namespace { // Simplifying using an assume can only be done in a particular control-flow // context (the context instruction provides that context). If an assume and // the context instruction are not in the same block then the DT helps in // figuring out if we can use it. struct Query { ExclInvsSet ExclInvs; const DataLayout &DL; AssumptionCache *AC; const Instruction *CxtI; const DominatorTree *DT; Query(const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) : DL(DL), AC(AC), CxtI(CxtI), DT(DT) {} Query(const Query &Q, const Value *NewExcl) : ExclInvs(Q.ExclInvs), DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT) { ExclInvs.insert(NewExcl); } }; } // end anonymous namespace // Given the provided Value and, potentially, a context instruction, return // the preferred context instruction (if any). static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) { // If we've been provided with a context instruction, then use that (provided // it has been inserted). if (CxtI && CxtI->getParent()) return CxtI; // If the value is really an already-inserted instruction, then use that. CxtI = dyn_cast(V); if (CxtI && CxtI->getParent()) return CxtI; return nullptr; } static void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned Depth, const Query &Q); void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { ::computeKnownBits(V, KnownZero, KnownOne, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } bool llvm::haveNoCommonBitsSet(Value *LHS, Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { assert(LHS->getType() == RHS->getType() && "LHS and RHS should have the same type"); assert(LHS->getType()->isIntOrIntVectorTy() && "LHS and RHS should be integers"); IntegerType *IT = cast(LHS->getType()->getScalarType()); APInt LHSKnownZero(IT->getBitWidth(), 0), LHSKnownOne(IT->getBitWidth(), 0); APInt RHSKnownZero(IT->getBitWidth(), 0), RHSKnownOne(IT->getBitWidth(), 0); computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, DL, 0, AC, CxtI, DT); computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, DL, 0, AC, CxtI, DT); return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); } static void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, unsigned Depth, const Query &Q); void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { ::ComputeSignBit(V, KnownZero, KnownOne, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } static bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth, const Query &Q); bool llvm::isKnownToBeAPowerOfTwo(Value *V, const DataLayout &DL, bool OrZero, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::isKnownToBeAPowerOfTwo(V, OrZero, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } static bool isKnownNonZero(Value *V, unsigned Depth, const Query &Q); bool 
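
// A minimal usage sketch of the public wrappers defined above (not part of
// this patch): build a trivial function, then ask computeKnownBits about an
// 'and' whose constant mask forces bits to zero.  Only the computeKnownBits
// signature comes from this file; the module/function names, the 0xFF0 mask
// and the helper name are illustrative assumptions.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include <cassert>

static void knownBitsQueryExample() {
  llvm::LLVMContext Ctx;
  llvm::Module M("example", Ctx);
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  llvm::Function *F = llvm::Function::Create(
      llvm::FunctionType::get(I32, {I32}, /*isVarArg=*/false),
      llvm::Function::ExternalLinkage, "f", &M);
  llvm::IRBuilder<> B(llvm::BasicBlock::Create(Ctx, "entry", F));
  llvm::Value *X = &*F->arg_begin();
  llvm::Value *Masked = B.CreateAnd(X, B.getInt32(0xFF0));

  llvm::APInt KnownZero(32, 0), KnownOne(32, 0);
  llvm::computeKnownBits(Masked, KnownZero, KnownOne, M.getDataLayout());
  // Every bit outside 0xFF0 is known zero; no bit is known one.
  assert(KnownZero == llvm::APInt(32, ~0xFF0u));
  assert(KnownOne == 0);
}
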
llvm::isKnownNonZero(Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::isKnownNonZero(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } bool llvm::isKnownNonNegative(Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { bool NonNegative, Negative; ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT); return NonNegative; } static bool isKnownNonEqual(Value *V1, Value *V2, const Query &Q); bool llvm::isKnownNonEqual(Value *V1, Value *V2, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::isKnownNonEqual(V1, V2, Query(DL, AC, safeCxtI(V1, safeCxtI(V2, CxtI)), DT)); } static bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth, const Query &Q); bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::MaskedValueIsZero(V, Mask, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } static unsigned ComputeNumSignBits(Value *V, unsigned Depth, const Query &Q); unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::ComputeNumSignBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } static void computeKnownBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, APInt &KnownZero, APInt &KnownOne, APInt &KnownZero2, APInt &KnownOne2, unsigned Depth, const Query &Q) { if (!Add) { if (ConstantInt *CLHS = dyn_cast(Op0)) { // We know that the top bits of C-X are clear if X contains less bits // than C (i.e. no wrap-around can happen). For example, 20-X is // positive if we can prove that X is >= 0 and < 16. if (!CLHS->getValue().isNegative()) { unsigned BitWidth = KnownZero.getBitWidth(); unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is // from [0-C]. if ((KnownZero2 & MaskV) == MaskV) { unsigned NLZ2 = CLHS->getValue().countLeadingZeros(); // Top bits known zero. KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2); } } } } unsigned BitWidth = KnownZero.getBitWidth(); // If an initial sequence of bits in the result is not needed, the // corresponding bits in the operands are not needed. APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); computeKnownBits(Op0, LHSKnownZero, LHSKnownOne, Depth + 1, Q); computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q); // Carry in a 1 for a subtract, rather than a 0. APInt CarryIn(BitWidth, 0); if (!Add) { // Sum = LHS + ~RHS + 1 std::swap(KnownZero2, KnownOne2); CarryIn.setBit(0); } APInt PossibleSumZero = ~LHSKnownZero + ~KnownZero2 + CarryIn; APInt PossibleSumOne = LHSKnownOne + KnownOne2 + CarryIn; // Compute known bits of the carry. APInt CarryKnownZero = ~(PossibleSumZero ^ LHSKnownZero ^ KnownZero2); APInt CarryKnownOne = PossibleSumOne ^ LHSKnownOne ^ KnownOne2; // Compute set of known bits (where all three relevant bits are known). 
APInt LHSKnown = LHSKnownZero | LHSKnownOne; APInt RHSKnown = KnownZero2 | KnownOne2; APInt CarryKnown = CarryKnownZero | CarryKnownOne; APInt Known = LHSKnown & RHSKnown & CarryKnown; assert((PossibleSumZero & Known) == (PossibleSumOne & Known) && "known bits of sum differ"); // Compute known bits of the result. KnownZero = ~PossibleSumOne & Known; KnownOne = PossibleSumOne & Known; // Are we still trying to solve for the sign bit? if (!Known.isNegative()) { if (NSW) { // Adding two non-negative numbers, or subtracting a negative number from // a non-negative one, can't wrap into negative. if (LHSKnownZero.isNegative() && KnownZero2.isNegative()) KnownZero |= APInt::getSignBit(BitWidth); // Adding two negative numbers, or subtracting a non-negative number from // a negative one, can't wrap into non-negative. else if (LHSKnownOne.isNegative() && KnownOne2.isNegative()) KnownOne |= APInt::getSignBit(BitWidth); } } } static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW, APInt &KnownZero, APInt &KnownOne, APInt &KnownZero2, APInt &KnownOne2, unsigned Depth, const Query &Q) { unsigned BitWidth = KnownZero.getBitWidth(); computeKnownBits(Op1, KnownZero, KnownOne, Depth + 1, Q); computeKnownBits(Op0, KnownZero2, KnownOne2, Depth + 1, Q); bool isKnownNegative = false; bool isKnownNonNegative = false; // If the multiplication is known not to overflow, compute the sign bit. if (NSW) { if (Op0 == Op1) { // The product of a number with itself is non-negative. isKnownNonNegative = true; } else { bool isKnownNonNegativeOp1 = KnownZero.isNegative(); bool isKnownNonNegativeOp0 = KnownZero2.isNegative(); bool isKnownNegativeOp1 = KnownOne.isNegative(); bool isKnownNegativeOp0 = KnownOne2.isNegative(); // The product of two numbers with the same sign is non-negative. isKnownNonNegative = (isKnownNegativeOp1 && isKnownNegativeOp0) || (isKnownNonNegativeOp1 && isKnownNonNegativeOp0); // The product of a negative number and a non-negative number is either // negative or zero. if (!isKnownNonNegative) isKnownNegative = (isKnownNegativeOp1 && isKnownNonNegativeOp0 && isKnownNonZero(Op0, Depth, Q)) || (isKnownNegativeOp0 && isKnownNonNegativeOp1 && isKnownNonZero(Op1, Depth, Q)); } } // If low bits are zero in either operand, output low known-0 bits. // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. KnownOne.clearAllBits(); unsigned TrailZ = KnownZero.countTrailingOnes() + KnownZero2.countTrailingOnes(); unsigned LeadZ = std::max(KnownZero.countLeadingOnes() + KnownZero2.countLeadingOnes(), BitWidth) - BitWidth; TrailZ = std::min(TrailZ, BitWidth); LeadZ = std::min(LeadZ, BitWidth); KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | APInt::getHighBitsSet(BitWidth, LeadZ); // Only make use of no-wrap flags if we failed to compute the sign bit // directly. This matters if the multiplication always overflows, in // which case we prefer to follow the result of the direct computation, // though as the program is invoking undefined behaviour we can choose // whatever we like here. 
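
// A standalone sketch of the carry trick computeKnownBitsAddSub uses above,
// shrunk to plain uint8_t in place of APInt (illustrative only): form the
// largest and smallest possible sums, recover the carry bits that are forced
// either way, and keep only the result bits whose operand bits and carry-in
// are all known.  Subtraction reuses the same machinery with ~RHS and a
// carry-in of 1; that part is omitted here.
#include <cassert>
#include <cstdint>

// Known bits of an 8-bit value: Zero has a 1 for each bit known to be 0, One
// has a 1 for each bit known to be 1; bits clear in both are unknown.
struct KnownBits8 { uint8_t Zero, One; };

static KnownBits8 knownBitsAdd8(KnownBits8 L, KnownBits8 R) {
  uint8_t PossibleSumZero = (uint8_t)(~L.Zero + ~R.Zero); // unknown bits as 1
  uint8_t PossibleSumOne  = (uint8_t)(L.One + R.One);     // unknown bits as 0
  uint8_t CarryKnownZero  = (uint8_t)~(PossibleSumZero ^ L.Zero ^ R.Zero);
  uint8_t CarryKnownOne   = (uint8_t)(PossibleSumOne ^ L.One ^ R.One);
  // A result bit is known only if its LHS bit, RHS bit and carry-in are known.
  uint8_t Known = (uint8_t)((L.Zero | L.One) & (R.Zero | R.One) &
                            (CarryKnownZero | CarryKnownOne));
  return {(uint8_t)(~PossibleSumOne & Known),
          (uint8_t)(PossibleSumOne & Known)};
}

int main() {
  // L is xxxx1000 (low three bits zero, bit 3 one, top nibble unknown) and R
  // is exactly 3, so the low nibble of the sum is known to be 1011.
  KnownBits8 L{0x07, 0x08}, R{0xFC, 0x03};
  KnownBits8 S = knownBitsAdd8(L, R);
  assert(S.Zero == 0x04 && S.One == 0x0B);
}
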
if (isKnownNonNegative && !KnownOne.isNegative()) KnownZero.setBit(BitWidth - 1); else if (isKnownNegative && !KnownZero.isNegative()) KnownOne.setBit(BitWidth - 1); } void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges, APInt &KnownZero, APInt &KnownOne) { unsigned BitWidth = KnownZero.getBitWidth(); unsigned NumRanges = Ranges.getNumOperands() / 2; assert(NumRanges >= 1); KnownZero.setAllBits(); KnownOne.setAllBits(); for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Lower = mdconst::extract(Ranges.getOperand(2 * i + 0)); ConstantInt *Upper = mdconst::extract(Ranges.getOperand(2 * i + 1)); ConstantRange Range(Lower->getValue(), Upper->getValue()); // The first CommonPrefixBits of all values in Range are equal. unsigned CommonPrefixBits = (Range.getUnsignedMax() ^ Range.getUnsignedMin()).countLeadingZeros(); APInt Mask = APInt::getHighBitsSet(BitWidth, CommonPrefixBits); KnownOne &= Range.getUnsignedMax() & Mask; KnownZero &= ~Range.getUnsignedMax() & Mask; } } static bool isEphemeralValueOf(Instruction *I, const Value *E) { SmallVector WorkSet(1, I); SmallPtrSet Visited; SmallPtrSet EphValues; // The instruction defining an assumption's condition itself is always // considered ephemeral to that assumption (even if it has other // non-ephemeral users). See r246696's test case for an example. if (std::find(I->op_begin(), I->op_end(), E) != I->op_end()) return true; while (!WorkSet.empty()) { const Value *V = WorkSet.pop_back_val(); if (!Visited.insert(V).second) continue; // If all uses of this value are ephemeral, then so is this value. if (std::all_of(V->user_begin(), V->user_end(), [&](const User *U) { return EphValues.count(U); })) { if (V == E) return true; EphValues.insert(V); if (const User *U = dyn_cast(V)) for (User::const_op_iterator J = U->op_begin(), JE = U->op_end(); J != JE; ++J) { if (isSafeToSpeculativelyExecute(*J)) WorkSet.push_back(*J); } } } return false; } // Is this an intrinsic that cannot be speculated but also cannot trap? static bool isAssumeLikeIntrinsic(const Instruction *I) { if (const CallInst *CI = dyn_cast(I)) if (Function *F = CI->getCalledFunction()) switch (F->getIntrinsicID()) { default: break; // FIXME: This list is repeated from NoTTI::getIntrinsicCost. case Intrinsic::assume: case Intrinsic::dbg_declare: case Intrinsic::dbg_value: case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::objectsize: case Intrinsic::ptr_annotation: case Intrinsic::var_annotation: return true; } return false; } static bool isValidAssumeForContext(Value *V, const Instruction *CxtI, const DominatorTree *DT) { Instruction *Inv = cast(V); // There are two restrictions on the use of an assume: // 1. The assume must dominate the context (or the control flow must // reach the assume whenever it reaches the context). // 2. The context must not be in the assume's set of ephemeral values // (otherwise we will use the assume to prove that the condition // feeding the assume is trivially true, thus causing the removal of // the assume). if (DT) { if (DT->dominates(Inv, CxtI)) { return true; } else if (Inv->getParent() == CxtI->getParent()) { // The context comes first, but they're both in the same block. Make sure // there is nothing in between that might interrupt the control flow. 
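
// A standalone sketch of the idea behind computeKnownBitsFromRangeMetadata
// above, on an 8-bit inclusive range [Min, Max] (illustrative only): every
// value in the range shares the leading bits on which Min and Max agree, so
// those bits are known.  The real code works on ConstantRange/APInt and
// intersects the result over all of the !range pairs.
#include <cassert>
#include <cstdint>

static void knownBitsFromRange8(uint8_t Min, uint8_t Max,
                                uint8_t &KnownZero, uint8_t &KnownOne) {
  uint8_t Mask = 0;
  // Common prefix of Min and Max, from the most significant bit down.
  for (int Bit = 7; Bit >= 0 && ((Min >> Bit) & 1) == ((Max >> Bit) & 1); --Bit)
    Mask |= (uint8_t)(1u << Bit);
  KnownOne = Max & Mask;
  KnownZero = (uint8_t)(~Max & Mask);
}

int main() {
  uint8_t KZ, KO;
  // Values in [0x20, 0x27] all look like 0b00100xxx.
  knownBitsFromRange8(0x20, 0x27, KZ, KO);
  assert(KO == 0x20 && KZ == 0xD8);
}
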
for (BasicBlock::const_iterator I = std::next(BasicBlock::const_iterator(CxtI)), IE(Inv); I != IE; ++I) if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I)) return false; return !isEphemeralValueOf(Inv, CxtI); } return false; } // When we don't have a DT, we do a limited search... if (Inv->getParent() == CxtI->getParent()->getSinglePredecessor()) { return true; } else if (Inv->getParent() == CxtI->getParent()) { // Search forward from the assume until we reach the context (or the end // of the block); the common case is that the assume will come first. for (BasicBlock::iterator I = std::next(BasicBlock::iterator(Inv)), IE = Inv->getParent()->end(); I != IE; ++I) if (&*I == CxtI) return true; // The context must come first... for (BasicBlock::const_iterator I = std::next(BasicBlock::const_iterator(CxtI)), IE(Inv); I != IE; ++I) if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I)) return false; return !isEphemeralValueOf(Inv, CxtI); } return false; } bool llvm::isValidAssumeForContext(const Instruction *I, const Instruction *CxtI, const DominatorTree *DT) { return ::isValidAssumeForContext(const_cast(I), CxtI, DT); } template inline match_combine_or, CmpClass_match> m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R) { return m_CombineOr(m_ICmp(Pred, L, R), m_ICmp(Pred, R, L)); } template inline match_combine_or, BinaryOp_match> m_c_And(const LHS &L, const RHS &R) { return m_CombineOr(m_And(L, R), m_And(R, L)); } template inline match_combine_or, BinaryOp_match> m_c_Or(const LHS &L, const RHS &R) { return m_CombineOr(m_Or(L, R), m_Or(R, L)); } template inline match_combine_or, BinaryOp_match> m_c_Xor(const LHS &L, const RHS &R) { return m_CombineOr(m_Xor(L, R), m_Xor(R, L)); } /// Compute known bits in 'V' under the assumption that the condition 'Cmp' is /// true (at the context instruction.) This is mostly a utility function for /// the prototype dominating conditions reasoning below. static void computeKnownBitsFromTrueCondition(Value *V, ICmpInst *Cmp, APInt &KnownZero, APInt &KnownOne, unsigned Depth, const Query &Q) { Value *LHS = Cmp->getOperand(0); Value *RHS = Cmp->getOperand(1); // TODO: We could potentially be more aggressive here. This would be worth // evaluating. If we can, explore commoning this code with the assume // handling logic. if (LHS != V && RHS != V) return; const unsigned BitWidth = KnownZero.getBitWidth(); switch (Cmp->getPredicate()) { default: // We know nothing from this condition break; // TODO: implement unsigned bound from below (known one bits) // TODO: common condition check implementations with assumes // TODO: implement other patterns from assume (e.g. V & B == A) case ICmpInst::ICMP_SGT: if (LHS == V) { APInt KnownZeroTemp(BitWidth, 0), KnownOneTemp(BitWidth, 0); computeKnownBits(RHS, KnownZeroTemp, KnownOneTemp, Depth + 1, Q); if (KnownOneTemp.isAllOnesValue() || KnownZeroTemp.isNegative()) { // We know that the sign bit is zero. 
KnownZero |= APInt::getSignBit(BitWidth); } } break; case ICmpInst::ICMP_EQ: { APInt KnownZeroTemp(BitWidth, 0), KnownOneTemp(BitWidth, 0); if (LHS == V) computeKnownBits(RHS, KnownZeroTemp, KnownOneTemp, Depth + 1, Q); else if (RHS == V) computeKnownBits(LHS, KnownZeroTemp, KnownOneTemp, Depth + 1, Q); else llvm_unreachable("missing use?"); KnownZero |= KnownZeroTemp; KnownOne |= KnownOneTemp; } break; case ICmpInst::ICMP_ULE: if (LHS == V) { APInt KnownZeroTemp(BitWidth, 0), KnownOneTemp(BitWidth, 0); computeKnownBits(RHS, KnownZeroTemp, KnownOneTemp, Depth + 1, Q); // The known zero bits carry over unsigned SignBits = KnownZeroTemp.countLeadingOnes(); KnownZero |= APInt::getHighBitsSet(BitWidth, SignBits); } break; case ICmpInst::ICMP_ULT: if (LHS == V) { APInt KnownZeroTemp(BitWidth, 0), KnownOneTemp(BitWidth, 0); computeKnownBits(RHS, KnownZeroTemp, KnownOneTemp, Depth + 1, Q); // Whatever high bits in rhs are zero are known to be zero (if rhs is a // power of 2, then one more). unsigned SignBits = KnownZeroTemp.countLeadingOnes(); if (isKnownToBeAPowerOfTwo(RHS, false, Depth + 1, Query(Q, Cmp))) SignBits++; KnownZero |= APInt::getHighBitsSet(BitWidth, SignBits); } break; }; } /// Compute known bits in 'V' from conditions which are known to be true along /// all paths leading to the context instruction. In particular, look for /// cases where one branch of an interesting condition dominates the context /// instruction. This does not do general dataflow. /// NOTE: This code is EXPERIMENTAL and currently off by default. static void computeKnownBitsFromDominatingCondition(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned Depth, const Query &Q) { // Need both the dominator tree and the query location to do anything useful if (!Q.DT || !Q.CxtI) return; Instruction *Cxt = const_cast(Q.CxtI); // The context instruction might be in a statically unreachable block. If // so, asking dominator queries may yield suprising results. (e.g. the block // may not have a dom tree node) if (!Q.DT->isReachableFromEntry(Cxt->getParent())) return; // Avoid useless work if (auto VI = dyn_cast(V)) if (VI->getParent() == Cxt->getParent()) return; // Note: We currently implement two options. It's not clear which of these // will survive long term, we need data for that. // Option 1 - Try walking the dominator tree looking for conditions which // might apply. This works well for local conditions (loop guards, etc..), // but not as well for things far from the context instruction (presuming a // low max blocks explored). If we can set an high enough limit, this would // be all we need. // Option 2 - We restrict out search to those conditions which are uses of // the value we're interested in. This is independent of dom structure, // but is slightly less powerful without looking through lots of use chains. // It does handle conditions far from the context instruction (e.g. early // function exits on entry) really well though. 
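
// A standalone 8-bit sketch of the ICMP_ULT case in
// computeKnownBitsFromTrueCondition above (illustrative only): if "V <u RHS"
// holds on the taken edge, every leading bit known to be zero in RHS is zero
// in V as well, and if RHS is a power of two the next bit is zero too.
#include <cassert>
#include <cstdint>

static uint8_t knownZeroFromULT(uint8_t RHSKnownZero, bool RHSIsPowerOfTwo) {
  unsigned LeadZ = 0;
  for (int Bit = 7; Bit >= 0 && ((RHSKnownZero >> Bit) & 1); --Bit)
    ++LeadZ;                           // countLeadingOnes(RHSKnownZero)
  if (RHSIsPowerOfTwo)
    ++LeadZ;                           // V <u (1 << k) also clears bit k
  uint8_t HighBits = 0;
  for (unsigned i = 0; i < LeadZ && i < 8; ++i)
    HighBits |= (uint8_t)(0x80u >> i); // APInt::getHighBitsSet equivalent
  return HighBits;
}

int main() {
  // V <u 16: RHS == 0b00010000, so its top three bits are zero and it is a
  // power of two; V's top four bits are therefore known zero.
  assert(knownZeroFromULT(/*RHSKnownZero=*/0xEF, /*RHSIsPowerOfTwo=*/true) ==
         0xF0);
}
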
// Option 1 - Search the dom tree unsigned NumBlocksExplored = 0; BasicBlock *Current = Cxt->getParent(); while (true) { // Stop searching if we've gone too far up the chain if (NumBlocksExplored >= DomConditionsMaxDomBlocks) break; NumBlocksExplored++; if (!Q.DT->getNode(Current)->getIDom()) break; Current = Q.DT->getNode(Current)->getIDom()->getBlock(); if (!Current) // found function entry break; BranchInst *BI = dyn_cast(Current->getTerminator()); if (!BI || BI->isUnconditional()) continue; ICmpInst *Cmp = dyn_cast(BI->getCondition()); if (!Cmp) continue; // We're looking for conditions that are guaranteed to hold at the context // instruction. Finding a condition where one path dominates the context // isn't enough because both the true and false cases could merge before // the context instruction we're actually interested in. Instead, we need // to ensure that the taken *edge* dominates the context instruction. We // know that the edge must be reachable since we started from a reachable // block. BasicBlock *BB0 = BI->getSuccessor(0); BasicBlockEdge Edge(BI->getParent(), BB0); if (!Edge.isSingleEdge() || !Q.DT->dominates(Edge, Q.CxtI->getParent())) continue; computeKnownBitsFromTrueCondition(V, Cmp, KnownZero, KnownOne, Depth, Q); } // Option 2 - Search the other uses of V unsigned NumUsesExplored = 0; for (auto U : V->users()) { // Avoid massive lists if (NumUsesExplored >= DomConditionsMaxUses) break; NumUsesExplored++; // Consider only compare instructions uniquely controlling a branch ICmpInst *Cmp = dyn_cast(U); if (!Cmp) continue; if (DomConditionsSingleCmpUse && !Cmp->hasOneUse()) continue; for (auto *CmpU : Cmp->users()) { BranchInst *BI = dyn_cast(CmpU); if (!BI || BI->isUnconditional()) continue; // We're looking for conditions that are guaranteed to hold at the // context instruction. Finding a condition where one path dominates // the context isn't enough because both the true and false cases could // merge before the context instruction we're actually interested in. // Instead, we need to ensure that the taken *edge* dominates the context // instruction. BasicBlock *BB0 = BI->getSuccessor(0); BasicBlockEdge Edge(BI->getParent(), BB0); if (!Edge.isSingleEdge() || !Q.DT->dominates(Edge, Q.CxtI->getParent())) continue; computeKnownBitsFromTrueCondition(V, Cmp, KnownZero, KnownOne, Depth, Q); } } } static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned Depth, const Query &Q) { // Use of assumptions is context-sensitive. If we don't have a context, we // cannot use them! if (!Q.AC || !Q.CxtI) return; unsigned BitWidth = KnownZero.getBitWidth(); for (auto &AssumeVH : Q.AC->assumptions()) { if (!AssumeVH) continue; CallInst *I = cast(AssumeVH); assert(I->getParent()->getParent() == Q.CxtI->getParent()->getParent() && "Got assumption for the wrong function!"); if (Q.ExclInvs.count(I)) continue; // Warning: This loop can end up being somewhat performance sensetive. // We're running this loop for once for each value queried resulting in a // runtime of ~O(#assumes * #values). assert(I->getCalledFunction()->getIntrinsicID() == Intrinsic::assume && "must be an assume intrinsic"); Value *Arg = I->getArgOperand(0); if (Arg == V && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { assert(BitWidth == 1 && "assume operand is not i1?"); KnownZero.clearAllBits(); KnownOne.setAllBits(); return; } // The remaining tests are all recursive, so bail out if we hit the limit. 
if (Depth == MaxDepth) continue; Value *A, *B; auto m_V = m_CombineOr(m_Specific(V), m_CombineOr(m_PtrToInt(m_Specific(V)), m_BitCast(m_Specific(V)))); CmpInst::Predicate Pred; ConstantInt *C; // assume(v = a) if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); KnownZero |= RHSKnownZero; KnownOne |= RHSKnownOne; // assume(v & b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0); computeKnownBits(B, MaskKnownZero, MaskKnownOne, Depth+1, Query(Q, I)); // For those bits in the mask that are known to be one, we can propagate // known bits from the RHS to V. KnownZero |= RHSKnownZero & MaskKnownOne; KnownOne |= RHSKnownOne & MaskKnownOne; // assume(~(v & b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0); computeKnownBits(B, MaskKnownZero, MaskKnownOne, Depth+1, Query(Q, I)); // For those bits in the mask that are known to be one, we can propagate // inverted known bits from the RHS to V. KnownZero |= RHSKnownOne & MaskKnownOne; KnownOne |= RHSKnownZero & MaskKnownOne; // assume(v | b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0); computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate known // bits from the RHS to V. KnownZero |= RHSKnownZero & BKnownZero; KnownOne |= RHSKnownOne & BKnownZero; // assume(~(v | b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0); computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate // inverted known bits from the RHS to V. KnownZero |= RHSKnownOne & BKnownZero; KnownOne |= RHSKnownZero & BKnownZero; // assume(v ^ b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0); computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate known // bits from the RHS to V. 
For those bits in B that are known to be one, // we can propagate inverted known bits from the RHS to V. KnownZero |= RHSKnownZero & BKnownZero; KnownOne |= RHSKnownOne & BKnownZero; KnownZero |= RHSKnownOne & BKnownOne; KnownOne |= RHSKnownZero & BKnownOne; // assume(~(v ^ b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0); computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate // inverted known bits from the RHS to V. For those bits in B that are // known to be one, we can propagate known bits from the RHS to V. KnownZero |= RHSKnownOne & BKnownZero; KnownOne |= RHSKnownZero & BKnownZero; KnownZero |= RHSKnownZero & BKnownOne; KnownOne |= RHSKnownOne & BKnownOne; // assume(v << c = a) } else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them to known // bits in V shifted to the right by C. KnownZero |= RHSKnownZero.lshr(C->getZExtValue()); KnownOne |= RHSKnownOne.lshr(C->getZExtValue()); // assume(~(v << c) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them inverted // to known bits in V shifted to the right by C. KnownZero |= RHSKnownOne.lshr(C->getZExtValue()); KnownOne |= RHSKnownZero.lshr(C->getZExtValue()); // assume(v >> c = a) } else if (match(Arg, m_c_ICmp(Pred, m_CombineOr(m_LShr(m_V, m_ConstantInt(C)), m_AShr(m_V, m_ConstantInt(C))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them to known // bits in V shifted to the right by C. KnownZero |= RHSKnownZero << C->getZExtValue(); KnownOne |= RHSKnownOne << C->getZExtValue(); // assume(~(v >> c) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_CombineOr( m_LShr(m_V, m_ConstantInt(C)), m_AShr(m_V, m_ConstantInt(C)))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them inverted // to known bits in V shifted to the right by C. 
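
// A standalone 8-bit sketch of the "assume((v << c) == a)" case above
// (illustrative only): bits known in 'a' describe the bits of 'v' that were
// shifted up into them, so the known bits of 'a' shifted right by c become
// known bits of 'v'; the top c bits of 'v' were shifted out and stay unknown.
// The real code uses APInt::lshr.
#include <cassert>
#include <cstdint>

static void knownBitsFromShlAssume(uint8_t AKnownZero, uint8_t AKnownOne,
                                   unsigned C, uint8_t &VKnownZero,
                                   uint8_t &VKnownOne) {
  VKnownZero = (uint8_t)(AKnownZero >> C);
  VKnownOne = (uint8_t)(AKnownOne >> C);
}

int main() {
  uint8_t VKZ, VKO;
  // assume((v << 4) == 0xA0): 'a' is fully known, so the low nibble of 'v'
  // must be 0b1010; nothing is learned about the high nibble.
  knownBitsFromShlAssume(/*AKnownZero=*/0x5F, /*AKnownOne=*/0xA0, /*C=*/4,
                         VKZ, VKO);
  assert(VKZ == 0x05 && VKO == 0x0A);
}
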
KnownZero |= RHSKnownOne << C->getZExtValue(); KnownOne |= RHSKnownZero << C->getZExtValue(); // assume(v >=_s c) where c is non-negative } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SGE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); if (RHSKnownZero.isNegative()) { // We know that the sign bit is zero. KnownZero |= APInt::getSignBit(BitWidth); } // assume(v >_s c) where c is at least -1. } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SGT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); if (RHSKnownOne.isAllOnesValue() || RHSKnownZero.isNegative()) { // We know that the sign bit is zero. KnownZero |= APInt::getSignBit(BitWidth); } // assume(v <=_s c) where c is negative } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SLE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); if (RHSKnownOne.isNegative()) { // We know that the sign bit is one. KnownOne |= APInt::getSignBit(BitWidth); } // assume(v <_s c) where c is non-positive } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SLT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); if (RHSKnownZero.isAllOnesValue() || RHSKnownOne.isNegative()) { // We know that the sign bit is one. KnownOne |= APInt::getSignBit(BitWidth); } // assume(v <=_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); // Whatever high bits in c are zero are known to be zero. KnownZero |= APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes()); // assume(v <_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I)); // Whatever high bits in c are zero are known to be zero (if c is a power // of 2, then one more). if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I))) KnownZero |= APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes()+1); else KnownZero |= APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes()); } } } // Compute known bits from a shift operator, including those with a // non-constant shift amount. KnownZero and KnownOne are the outputs of this // function. KnownZero2 and KnownOne2 are pre-allocated temporaries with the // same bit width as KnownZero and KnownOne. KZF and KOF are operator-specific // functors that, given the known-zero or known-one bits respectively, and a // shift amount, compute the implied known-zero or known-one bits of the shift // operator's result respectively for that shift amount. The results from calling // KZF and KOF are conservatively combined for all permitted shift amounts. 
template static void computeKnownBitsFromShiftOperator(Operator *I, APInt &KnownZero, APInt &KnownOne, APInt &KnownZero2, APInt &KnownOne2, unsigned Depth, const Query &Q, KZFunctor KZF, KOFunctor KOF) { unsigned BitWidth = KnownZero.getBitWidth(); if (auto *SA = dyn_cast(I->getOperand(1))) { unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1); computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); KnownZero = KZF(KnownZero, ShiftAmt); KnownOne = KOF(KnownOne, ShiftAmt); return; } computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); // Note: We cannot use KnownZero.getLimitedValue() here, because if // BitWidth > 64 and any upper bits are known, we'll end up returning the // limit value (which implies all bits are known). uint64_t ShiftAmtKZ = KnownZero.zextOrTrunc(64).getZExtValue(); uint64_t ShiftAmtKO = KnownOne.zextOrTrunc(64).getZExtValue(); // It would be more-clearly correct to use the two temporaries for this // calculation. Reusing the APInts here to prevent unnecessary allocations. KnownZero.clearAllBits(), KnownOne.clearAllBits(); // If we know the shifter operand is nonzero, we can sometimes infer more // known bits. However this is expensive to compute, so be lazy about it and // only compute it when absolutely necessary. Optional ShifterOperandIsNonZero; // Early exit if we can't constrain any well-defined shift amount. if (!(ShiftAmtKZ & (BitWidth - 1)) && !(ShiftAmtKO & (BitWidth - 1))) { ShifterOperandIsNonZero = isKnownNonZero(I->getOperand(1), Depth + 1, Q); if (!*ShifterOperandIsNonZero) return; } computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth); for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) { // Combine the shifted known input bits only for those shift amounts // compatible with its known constraints. if ((ShiftAmt & ~ShiftAmtKZ) != ShiftAmt) continue; if ((ShiftAmt | ShiftAmtKO) != ShiftAmt) continue; // If we know the shifter is nonzero, we may be able to infer more known // bits. This check is sunk down as far as possible to avoid the expensive // call to isKnownNonZero if the cheaper checks above fail. if (ShiftAmt == 0) { if (!ShifterOperandIsNonZero.hasValue()) ShifterOperandIsNonZero = isKnownNonZero(I->getOperand(1), Depth + 1, Q); if (*ShifterOperandIsNonZero) continue; } KnownZero &= KZF(KnownZero2, ShiftAmt); KnownOne &= KOF(KnownOne2, ShiftAmt); } // If there are no compatible shift amounts, then we've proven that the shift // amount must be >= the BitWidth, and the result is undefined. We could // return anything we'd like, but we need to make sure the sets of known bits // stay disjoint (it should be better for some other code to actually // propagate the undef than to pick a value here using known bits). if ((KnownZero & KnownOne) != 0) KnownZero.clearAllBits(), KnownOne.clearAllBits(); } static void computeKnownBitsFromOperator(Operator *I, APInt &KnownZero, APInt &KnownOne, unsigned Depth, const Query &Q) { unsigned BitWidth = KnownZero.getBitWidth(); APInt KnownZero2(KnownZero), KnownOne2(KnownOne); switch (I->getOpcode()) { default: break; case Instruction::Load: if (MDNode *MD = cast(I)->getMetadata(LLVMContext::MD_range)) computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne); break; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. 
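
// A standalone sketch of the strategy computeKnownBitsFromShiftOperator uses
// above when the shift amount is not a constant, reduced to an 8-bit "shl"
// (illustrative only): intersect the shifted known bits over every shift
// amount compatible with the amount's own known bits.  The lshr/ashr functors
// and the nonzero-shift-amount refinement are omitted.
#include <cassert>
#include <cstdint>

static void knownBitsShl8(uint8_t OpKZ, uint8_t OpKO,   // known bits of X
                          uint8_t AmtKZ, uint8_t AmtKO, // known bits of amount
                          uint8_t &KZ, uint8_t &KO) {
  KZ = KO = 0xFF; // start from "everything known", then intersect
  for (unsigned Amt = 0; Amt < 8; ++Amt) {
    if ((Amt & ~(unsigned)AmtKZ) != Amt)
      continue; // the amount would need a bit that is known zero
    if ((Amt | (unsigned)AmtKO) != Amt)
      continue; // the amount would clear a bit that is known one
    KZ &= (uint8_t)((OpKZ << Amt) | ((1u << Amt) - 1)); // low bits become zero
    KO &= (uint8_t)(OpKO << Amt);
  }
  if (KZ & KO) // no feasible in-range amount: keep the sets disjoint
    KZ = KO = 0;
}

int main() {
  uint8_t KZ, KO;
  // X is exactly 1; the shift amount is known odd and less than 4, so the
  // result is either 2 or 8: bits 1 and 3 stay unknown, all others are zero.
  knownBitsShl8(/*OpKZ=*/0xFE, /*OpKO=*/0x01, /*AmtKZ=*/0xFC, /*AmtKO=*/0x01,
                KZ, KO);
  assert(KZ == 0xF5 && KO == 0x00);
}
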
computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); // Output known-1 bits are only known if set in both the LHS & RHS. KnownOne &= KnownOne2; // Output known-0 are known to be clear if zero in either the LHS | RHS. KnownZero |= KnownZero2; // and(x, add (x, -1)) is a common idiom that always clears the low bit; // here we handle the more general case of adding any odd number by // matching the form add(x, add(x, y)) where y is odd. // TODO: This could be generalized to clearing any bit set in y where the // following bit is known to be unset in y. Value *Y = nullptr; if (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)), m_Value(Y))) || match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)), m_Value(Y)))) { APInt KnownZero3(BitWidth, 0), KnownOne3(BitWidth, 0); computeKnownBits(Y, KnownZero3, KnownOne3, Depth + 1, Q); if (KnownOne3.countTrailingOnes() > 0) KnownZero |= APInt::getLowBitsSet(BitWidth, 1); } break; } case Instruction::Or: { computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); // Output known-0 bits are only known if clear in both the LHS & RHS. KnownZero &= KnownZero2; // Output known-1 are known to be set if set in either the LHS | RHS. KnownOne |= KnownOne2; break; } case Instruction::Xor: { computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); // Output known-1 are known to be set if set in only one of the LHS, RHS. KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); KnownZero = KnownZeroOut; break; } case Instruction::Mul: { bool NSW = cast(I)->hasNoSignedWrap(); computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, Q); break; } case Instruction::UDiv: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); unsigned LeadZ = KnownZero2.countLeadingOnes(); KnownOne2.clearAllBits(); KnownZero2.clearAllBits(); computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q); unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); if (RHSUnknownLeadingOnes != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); break; } case Instruction::Select: computeKnownBits(I->getOperand(2), KnownZero, KnownOne, Depth + 1, Q); computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q); // Only known if known in both the LHS and RHS. KnownOne &= KnownOne2; KnownZero &= KnownZero2; break; case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::SIToFP: case Instruction::UIToFP: break; // Can't work with floating point. case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::AddrSpaceCast: // Pointers could be different sizes. // FALL THROUGH and handle them the same as zext/trunc. 
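
// An 8-bit restatement of the And/Or/Xor cases above (illustrative only).
#include <cassert>
#include <cstdint>

struct KB8 { uint8_t Zero, One; };

static KB8 kbAnd(KB8 A, KB8 B) {
  // A result bit is 1 only if both inputs are 1; it is 0 if either input is 0.
  return {(uint8_t)(A.Zero | B.Zero), (uint8_t)(A.One & B.One)};
}
static KB8 kbOr(KB8 A, KB8 B) {
  return {(uint8_t)(A.Zero & B.Zero), (uint8_t)(A.One | B.One)};
}
static KB8 kbXor(KB8 A, KB8 B) {
  // A result bit is known only where both inputs are known; it is 0 when they
  // agree and 1 when they differ.
  return {(uint8_t)((A.Zero & B.Zero) | (A.One & B.One)),
          (uint8_t)((A.Zero & B.One) | (A.One & B.Zero))};
}

int main() {
  KB8 A{0x0F, 0xF0}; // A is exactly 0xF0
  KB8 B{0x00, 0x00}; // B is completely unknown
  KB8 X = kbXor(A, B);
  assert(X.Zero == 0 && X.One == 0);    // xor with unknown: nothing known
  KB8 N = kbAnd(A, B);
  assert(N.Zero == 0x0F && N.One == 0); // and: A's zero bits force zeros
  KB8 O = kbOr(A, B);
  assert(O.Zero == 0 && O.One == 0xF0); // or: A's one bits force ones
}
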
case Instruction::ZExt: case Instruction::Trunc: { Type *SrcTy = I->getOperand(0)->getType(); unsigned SrcBitWidth; // Note that we handle pointer operands here because of inttoptr/ptrtoint // which fall through here. SrcBitWidth = Q.DL.getTypeSizeInBits(SrcTy->getScalarType()); assert(SrcBitWidth && "SrcBitWidth can't be zero"); KnownZero = KnownZero.zextOrTrunc(SrcBitWidth); KnownOne = KnownOne.zextOrTrunc(SrcBitWidth); computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); KnownZero = KnownZero.zextOrTrunc(BitWidth); KnownOne = KnownOne.zextOrTrunc(BitWidth); // Any top bits are known to be zero. if (BitWidth > SrcBitWidth) KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); break; } case Instruction::BitCast: { Type *SrcTy = I->getOperand(0)->getType(); if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy() || SrcTy->isFloatingPointTy()) && // TODO: For now, not handling conversions like: // (bitcast i64 %x to <2 x i32>) !I->getType()->isVectorTy()) { computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); break; } break; } case Instruction::SExt: { // Compute the bits in the result that are not present in the input. unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); KnownZero = KnownZero.trunc(SrcBitWidth); KnownOne = KnownOne.trunc(SrcBitWidth); computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. if (KnownZero[SrcBitWidth-1]) // Input sign bit known zero KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); else if (KnownOne[SrcBitWidth-1]) // Input sign bit known set KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); break; } case Instruction::Shl: { // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) { return (KnownZero << ShiftAmt) | APInt::getLowBitsSet(BitWidth, ShiftAmt); // Low bits known 0. }; auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) { return KnownOne << ShiftAmt; }; computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, Q, KZF, KOF); break; } case Instruction::LShr: { // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) { return APIntOps::lshr(KnownZero, ShiftAmt) | // High bits known zero. 
APInt::getHighBitsSet(BitWidth, ShiftAmt); }; auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) { return APIntOps::lshr(KnownOne, ShiftAmt); }; computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, Q, KZF, KOF); break; } case Instruction::AShr: { // (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) { return APIntOps::ashr(KnownZero, ShiftAmt); }; auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) { return APIntOps::ashr(KnownOne, ShiftAmt); }; computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, Q, KZF, KOF); break; } case Instruction::Sub: { bool NSW = cast(I)->hasNoSignedWrap(); computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, Q); break; } case Instruction::Add: { bool NSW = cast(I)->hasNoSignedWrap(); computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, Q); break; } case Instruction::SRem: if (ConstantInt *Rem = dyn_cast(I->getOperand(1))) { APInt RA = Rem->getValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); // The low bits of the first operand are unchanged by the srem. KnownZero = KnownZero2 & LowBits; KnownOne = KnownOne2 & LowBits; // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero. if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits)) KnownZero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0)) KnownOne |= ~LowBits; assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); } } // The sign bit is the LHS's sign bit, except when the result of the // remainder is zero. if (KnownZero.isNonNegative()) { APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1, Q); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) KnownZero.setBit(BitWidth - 1); } break; case Instruction::URem: { if (ConstantInt *Rem = dyn_cast(I->getOperand(1))) { APInt RA = Rem->getValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); KnownZero |= ~LowBits; KnownOne &= LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q); unsigned Leaders = std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes()); KnownOne.clearAllBits(); KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); break; } case Instruction::Alloca: { AllocaInst *AI = cast(I); unsigned Align = AI->getAlignment(); if (Align == 0) - Align = Q.DL.getABITypeAlignment(AI->getType()->getElementType()); + Align = Q.DL.getABITypeAlignment(AI->getAllocatedType()); if (Align > 0) KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align)); break; } case Instruction::GetElementPtr: { // Analyze all of the subscripts of this getelementptr instruction // to determine if we can prove known low zero bits. 
APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0); computeKnownBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, Depth + 1, Q); unsigned TrailZ = LocalKnownZero.countTrailingOnes(); gep_type_iterator GTI = gep_type_begin(I); for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) { Value *Index = I->getOperand(i); if (StructType *STy = dyn_cast(*GTI)) { // Handle struct member offset arithmetic. // Handle case when index is vector zeroinitializer Constant *CIndex = cast(Index); if (CIndex->isZeroValue()) continue; if (CIndex->getType()->isVectorTy()) Index = CIndex->getSplatValue(); unsigned Idx = cast(Index)->getZExtValue(); const StructLayout *SL = Q.DL.getStructLayout(STy); uint64_t Offset = SL->getElementOffset(Idx); TrailZ = std::min(TrailZ, countTrailingZeros(Offset)); } else { // Handle array index arithmetic. Type *IndexedTy = GTI.getIndexedType(); if (!IndexedTy->isSized()) { TrailZ = 0; break; } unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits(); uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy); LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0); computeKnownBits(Index, LocalKnownZero, LocalKnownOne, Depth + 1, Q); TrailZ = std::min(TrailZ, unsigned(countTrailingZeros(TypeSize) + LocalKnownZero.countTrailingOnes())); } } KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ); break; } case Instruction::PHI: { PHINode *P = cast(I); // Handle the case of a simple two-predecessor recurrence PHI. // There's a lot more that could theoretically be done here, but // this is sufficient to catch some interesting cases. if (P->getNumIncomingValues() == 2) { for (unsigned i = 0; i != 2; ++i) { Value *L = P->getIncomingValue(i); Value *R = P->getIncomingValue(!i); Operator *LU = dyn_cast(L); if (!LU) continue; unsigned Opcode = LU->getOpcode(); // Check for operations that have the property that if // both their operands have low zero bits, the result // will have low zero bits. if (Opcode == Instruction::Add || Opcode == Instruction::Sub || Opcode == Instruction::And || Opcode == Instruction::Or || Opcode == Instruction::Mul) { Value *LL = LU->getOperand(0); Value *LR = LU->getOperand(1); // Find a recurrence. if (LL == I) L = LR; else if (LR == I) L = LL; else break; // Ok, we have a PHI of the form L op= R. Check for low // zero bits. computeKnownBits(R, KnownZero2, KnownOne2, Depth + 1, Q); // We need to take the minimum number of known bits APInt KnownZero3(KnownZero), KnownOne3(KnownOne); computeKnownBits(L, KnownZero3, KnownOne3, Depth + 1, Q); KnownZero = APInt::getLowBitsSet(BitWidth, std::min(KnownZero2.countTrailingOnes(), KnownZero3.countTrailingOnes())); break; } } } // Unreachable blocks may have zero-operand PHI nodes. if (P->getNumIncomingValues() == 0) break; // Otherwise take the unions of the known bit sets of the operands, // taking conservative care to avoid excessive recursion. if (Depth < MaxDepth - 1 && !KnownZero && !KnownOne) { // Skip if every incoming value references to ourself. if (dyn_cast_or_null(P->hasConstantValue())) break; KnownZero = APInt::getAllOnesValue(BitWidth); KnownOne = APInt::getAllOnesValue(BitWidth); for (Value *IncValue : P->incoming_values()) { // Skip direct self references. if (IncValue == P) continue; KnownZero2 = APInt(BitWidth, 0); KnownOne2 = APInt(BitWidth, 0); // Recurse, but cap the recursion to one level, because we don't // want to waste time spinning around in loops. 
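
// A sketch of the alignment reasoning in the GetElementPtr case above, under
// the simplifying assumption of a single array index (illustrative only): the
// trailing zero count of "Base + Index * ElemSize" is at least the minimum of
// the base pointer's known trailing zeros and trailz(ElemSize) plus the
// index's known trailing zeros.  The real code walks every GEP operand and
// also handles struct field offsets.
#include <algorithm>
#include <cassert>
#include <cstdint>

static unsigned trailZ64(uint64_t X) {
  unsigned N = 0;
  while (N < 64 && !((X >> N) & 1))
    ++N;
  return N;
}

static unsigned gepKnownTrailingZeros(unsigned BaseTrailZ, uint64_t ElemSize,
                                      unsigned IndexTrailZ) {
  return std::min(BaseTrailZ, trailZ64(ElemSize) + IndexTrailZ);
}

int main() {
  // A 16-byte-aligned base indexed by an arbitrary i64 into an array of
  // 8-byte elements is still known to be 8-byte aligned.
  assert(gepKnownTrailingZeros(/*BaseTrailZ=*/4, /*ElemSize=*/8,
                               /*IndexTrailZ=*/0) == 3);
}
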
computeKnownBits(IncValue, KnownZero2, KnownOne2, MaxDepth - 1, Q); KnownZero &= KnownZero2; KnownOne &= KnownOne2; // If all bits have been ruled out, there's no need to check // more operands. if (!KnownZero && !KnownOne) break; } } break; } case Instruction::Call: case Instruction::Invoke: if (MDNode *MD = cast(I)->getMetadata(LLVMContext::MD_range)) computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne); // If a range metadata is attached to this IntrinsicInst, intersect the // explicit range specified by the metadata and the implicit range of // the intrinsic. if (IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::bswap: computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); KnownZero |= KnownZero2.byteSwap(); KnownOne |= KnownOne2.byteSwap(); break; case Intrinsic::ctlz: case Intrinsic::cttz: { unsigned LowBits = Log2_32(BitWidth)+1; // If this call is undefined for 0, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) LowBits -= 1; KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); break; } case Intrinsic::ctpop: { computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q); // We can bound the space the count needs. Also, bits known to be zero // can't contribute to the population. unsigned BitsPossiblySet = BitWidth - KnownZero2.countPopulation(); unsigned LeadingZeros = APInt(BitWidth, BitsPossiblySet).countLeadingZeros(); assert(LeadingZeros <= BitWidth); KnownZero |= APInt::getHighBitsSet(BitWidth, LeadingZeros); KnownOne &= ~KnownZero; // TODO: we could bound KnownOne using the lower bound on the number // of bits which might be set provided by popcnt KnownOne2. break; } case Intrinsic::fabs: { Type *Ty = II->getType(); APInt SignBit = APInt::getSignBit(Ty->getScalarSizeInBits()); KnownZero |= APInt::getSplat(Ty->getPrimitiveSizeInBits(), SignBit); break; } case Intrinsic::x86_sse42_crc32_64_64: KnownZero |= APInt::getHighBitsSet(64, 32); break; } } break; case Instruction::ExtractValue: if (IntrinsicInst *II = dyn_cast(I->getOperand(0))) { ExtractValueInst *EVI = cast(I); if (EVI->getNumIndices() != 1) break; if (EVI->getIndices()[0] == 0) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: computeKnownBitsAddSub(true, II->getArgOperand(0), II->getArgOperand(1), false, KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, Q); break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: computeKnownBitsAddSub(false, II->getArgOperand(0), II->getArgOperand(1), false, KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, Q); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false, KnownZero, KnownOne, KnownZero2, KnownOne2, Depth, Q); break; } } } } } static unsigned getAlignment(const Value *V, const DataLayout &DL) { unsigned Align = 0; if (auto *GO = dyn_cast(V)) { Align = GO->getAlignment(); if (Align == 0) { if (auto *GVar = dyn_cast(GO)) { Type *ObjectType = GVar->getValueType(); if (ObjectType->isSized()) { // If the object is defined in the current Module, we'll be giving // it the preferred alignment. Otherwise, we have to assume that it // may only have the minimum ABI alignment. 
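
// A toy 8-bit restatement of the ctpop bound above (illustrative only): if an
// operand already has Z bits known to be zero, its population count is at
// most 8 - Z, so every result bit above the position needed to represent that
// maximum is known zero.
#include <cassert>
#include <cstdint>

static uint8_t ctpopKnownZero8(unsigned KnownZeroBitsOfOperand) {
  unsigned MaxPop = 8 - KnownZeroBitsOfOperand; // largest possible popcount
  unsigned LeadingZeros = 0;
  for (int Bit = 7; Bit >= 0 && !((MaxPop >> Bit) & 1); --Bit)
    ++LeadingZeros;
  uint8_t KnownZero = 0;
  for (unsigned i = 0; i < LeadingZeros; ++i)
    KnownZero |= (uint8_t)(0x80u >> i);
  return KnownZero;
}

int main() {
  // Five operand bits known zero -> at most three bits set -> the result fits
  // in two bits, so the top six bits of ctpop are known zero.
  assert(ctpopKnownZero8(5) == 0xFC);
}
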
if (GVar->isStrongDefinitionForLinker()) Align = DL.getPreferredAlignment(GVar); else Align = DL.getABITypeAlignment(ObjectType); } } } } else if (const Argument *A = dyn_cast(V)) { Align = A->getType()->isPointerTy() ? A->getParamAlignment() : 0; if (!Align && A->hasStructRetAttr()) { // An sret parameter has at least the ABI alignment of the return type. Type *EltTy = cast(A->getType())->getElementType(); if (EltTy->isSized()) Align = DL.getABITypeAlignment(EltTy); } } else if (const AllocaInst *AI = dyn_cast(V)) Align = AI->getAlignment(); else if (auto CS = ImmutableCallSite(V)) Align = CS.getAttributes().getParamAlignment(AttributeSet::ReturnIndex); else if (const LoadInst *LI = dyn_cast(V)) if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) { ConstantInt *CI = mdconst::extract(MD->getOperand(0)); Align = CI->getLimitedValue(); } return Align; } /// Determine which bits of V are known to be either zero or one and return /// them in the KnownZero/KnownOne bit sets. /// /// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that /// we cannot optimize based on the assumption that it is zero without changing /// it to be an explicit zero. If we don't change it to zero, other code could /// optimized based on the contradictory assumption that it is non-zero. /// Because instcombine aggressively folds operations with undef args anyway, /// this won't lose us code quality. /// /// This function is defined on values with integer type, values with pointer /// type, and vectors of integers. In the case /// where V is a vector, known zero, and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned Depth, const Query &Q) { assert(V && "No Value?"); assert(Depth <= MaxDepth && "Limit Search Depth"); unsigned BitWidth = KnownZero.getBitWidth(); assert((V->getType()->isIntOrIntVectorTy() || V->getType()->isFPOrFPVectorTy() || V->getType()->getScalarType()->isPointerTy()) && "Not integer, floating point, or pointer type!"); assert((Q.DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth) && (!V->getType()->isIntOrIntVectorTy() || V->getType()->getScalarSizeInBits() == BitWidth) && KnownZero.getBitWidth() == BitWidth && KnownOne.getBitWidth() == BitWidth && "V, KnownOne and KnownZero should have same BitWidth"); if (ConstantInt *CI = dyn_cast(V)) { // We know all of the bits for a constant! KnownOne = CI->getValue(); KnownZero = ~KnownOne; return; } // Null and aggregate-zero are all-zeros. if (isa(V) || isa(V)) { KnownOne.clearAllBits(); KnownZero = APInt::getAllOnesValue(BitWidth); return; } // Handle a constant vector by taking the intersection of the known bits of // each element. There is no real need to handle ConstantVector here, because // we don't handle undef in any particularly useful way. if (ConstantDataSequential *CDS = dyn_cast(V)) { // We know that CDS must be a vector of integers. Take the intersection of // each element. KnownZero.setAllBits(); KnownOne.setAllBits(); APInt Elt(KnownZero.getBitWidth(), 0); for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { Elt = CDS->getElementAsInteger(i); KnownZero &= ~Elt; KnownOne &= Elt; } return; } // Start out not knowing anything. KnownZero.clearAllBits(); KnownOne.clearAllBits(); // Limit search depth. // All recursive calls that increase depth must come after this. 
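
// A standalone 8-bit sketch of the constant-vector handling above
// (illustrative only): for a vector value, a bit is reported known only if it
// is known and identical in every element, so the known bits of the vector
// are the intersection over the elements.
#include <cassert>
#include <cstdint>

static void knownBitsOfConstVector(const uint8_t *Elts, unsigned N,
                                   uint8_t &KnownZero, uint8_t &KnownOne) {
  KnownZero = 0xFF;
  KnownOne = 0xFF;
  for (unsigned i = 0; i != N; ++i) {
    KnownZero &= (uint8_t)~Elts[i];
    KnownOne &= Elts[i];
  }
}

int main() {
  // All three elements agree that bits 0 and 4 are one and bits 2, 5, 6 and 7
  // are zero; bits 1 and 3 differ and stay unknown.
  const uint8_t Elts[] = {0x11, 0x13, 0x19};
  uint8_t KZ, KO;
  knownBitsOfConstVector(Elts, 3, KZ, KO);
  assert(KO == 0x11 && KZ == 0xE4);
}
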
if (Depth == MaxDepth) return; // A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has // the bits of its aliasee. if (GlobalAlias *GA = dyn_cast(V)) { if (!GA->mayBeOverridden()) computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, Depth + 1, Q); return; } if (Operator *I = dyn_cast(V)) computeKnownBitsFromOperator(I, KnownZero, KnownOne, Depth, Q); // Aligned pointers have trailing zeros - refine KnownZero set if (V->getType()->isPointerTy()) { unsigned Align = getAlignment(V, Q.DL); if (Align) KnownZero |= APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align)); } // computeKnownBitsFromAssume and computeKnownBitsFromDominatingCondition // strictly refines KnownZero and KnownOne. Therefore, we run them after // computeKnownBitsFromOperator. // Check whether a nearby assume intrinsic can determine some known bits. computeKnownBitsFromAssume(V, KnownZero, KnownOne, Depth, Q); // Check whether there's a dominating condition which implies something about // this value at the given context. if (EnableDomConditions && Depth <= DomConditionsMaxDepth) computeKnownBitsFromDominatingCondition(V, KnownZero, KnownOne, Depth, Q); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); } /// Determine whether the sign bit is known to be zero or one. /// Convenience wrapper around computeKnownBits. void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, unsigned Depth, const Query &Q) { unsigned BitWidth = getBitWidth(V->getType(), Q.DL); if (!BitWidth) { KnownZero = false; KnownOne = false; return; } APInt ZeroBits(BitWidth, 0); APInt OneBits(BitWidth, 0); computeKnownBits(V, ZeroBits, OneBits, Depth, Q); KnownOne = OneBits[BitWidth - 1]; KnownZero = ZeroBits[BitWidth - 1]; } /// Return true if the given value is known to have exactly one /// bit set when defined. For vectors return true if every element is known to /// be a power of two when defined. Supports values with integer or pointer /// types and vectors of integers. bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth, const Query &Q) { if (Constant *C = dyn_cast(V)) { if (C->isNullValue()) return OrZero; if (ConstantInt *CI = dyn_cast(C)) return CI->getValue().isPowerOf2(); // TODO: Handle vector constants. } // 1 << X is clearly a power of two if the one is not shifted off the end. If // it is shifted off the end then the result is undefined. if (match(V, m_Shl(m_One(), m_Value()))) return true; // (signbit) >>l X is clearly a power of two if the one is not shifted off the // bottom. If it is shifted off the bottom then the result is undefined. if (match(V, m_LShr(m_SignBit(), m_Value()))) return true; // The remaining tests are all recursive, so bail out if we hit the limit. if (Depth++ == MaxDepth) return false; Value *X = nullptr, *Y = nullptr; // A shift left or a logical shift right of a power of two is a power of two // or zero. if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) || match(V, m_LShr(m_Value(X), m_Value())))) return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q); if (ZExtInst *ZI = dyn_cast(V)) return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth, Q); if (SelectInst *SI = dyn_cast(V)) return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth, Q) && isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth, Q); if (OrZero && match(V, m_And(m_Value(X), m_Value(Y)))) { // A power of two and'd with anything is a power of two or zero. 
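// For example, if X is known to be 0b01000, then X & Y is either 0b01000 or
// 0 for any Y, so the result is still a power of two or zero.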
if (isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q) || isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, Depth, Q)) return true; // X & (-X) is always a power of two or zero. if (match(X, m_Neg(m_Specific(Y))) || match(Y, m_Neg(m_Specific(X)))) return true; return false; } // Adding a power-of-two or zero to the same power-of-two or zero yields // either the original power-of-two, a larger power-of-two or zero. if (match(V, m_Add(m_Value(X), m_Value(Y)))) { OverflowingBinaryOperator *VOBO = cast(V); if (OrZero || VOBO->hasNoUnsignedWrap() || VOBO->hasNoSignedWrap()) { if (match(X, m_And(m_Specific(Y), m_Value())) || match(X, m_And(m_Value(), m_Specific(Y)))) if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth, Q)) return true; if (match(Y, m_And(m_Specific(X), m_Value())) || match(Y, m_And(m_Value(), m_Specific(X)))) if (isKnownToBeAPowerOfTwo(X, OrZero, Depth, Q)) return true; unsigned BitWidth = V->getType()->getScalarSizeInBits(); APInt LHSZeroBits(BitWidth, 0), LHSOneBits(BitWidth, 0); computeKnownBits(X, LHSZeroBits, LHSOneBits, Depth, Q); APInt RHSZeroBits(BitWidth, 0), RHSOneBits(BitWidth, 0); computeKnownBits(Y, RHSZeroBits, RHSOneBits, Depth, Q); // If i8 V is a power of two or zero: // ZeroBits: 1 1 1 0 1 1 1 1 // ~ZeroBits: 0 0 0 1 0 0 0 0 if ((~(LHSZeroBits & RHSZeroBits)).isPowerOf2()) // If OrZero isn't set, we cannot give back a zero result. // Make sure either the LHS or RHS has a bit set. if (OrZero || RHSOneBits.getBoolValue() || LHSOneBits.getBoolValue()) return true; } } // An exact divide or right shift can only shift off zero bits, so the result // is a power of two only if the first operand is a power of two and not // copying a sign bit (sdiv int_min, 2). if (match(V, m_Exact(m_LShr(m_Value(), m_Value()))) || match(V, m_Exact(m_UDiv(m_Value(), m_Value())))) { return isKnownToBeAPowerOfTwo(cast(V)->getOperand(0), OrZero, Depth, Q); } return false; } /// \brief Test whether a GEP's result is known to be non-null. /// /// Uses properties inherent in a GEP to try to determine whether it is known /// to be non-null. /// /// Currently this routine does not support vector GEPs. static bool isGEPKnownNonNull(GEPOperator *GEP, unsigned Depth, const Query &Q) { if (!GEP->isInBounds() || GEP->getPointerAddressSpace() != 0) return false; // FIXME: Support vector-GEPs. assert(GEP->getType()->isPointerTy() && "We only support plain pointer GEP"); // If the base pointer is non-null, we cannot walk to a null address with an // inbounds GEP in address space zero. if (isKnownNonZero(GEP->getPointerOperand(), Depth, Q)) return true; // Walk the GEP operands and see if any operand introduces a non-zero offset. // If so, then the GEP cannot produce a null pointer, as doing so would // inherently violate the inbounds contract within address space zero. for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); GTI != GTE; ++GTI) { // Struct types are easy -- they must always be indexed by a constant. if (StructType *STy = dyn_cast(*GTI)) { ConstantInt *OpC = cast(GTI.getOperand()); unsigned ElementIdx = OpC->getZExtValue(); const StructLayout *SL = Q.DL.getStructLayout(STy); uint64_t ElementOffset = SL->getElementOffset(ElementIdx); if (ElementOffset > 0) return true; continue; } // If we have a zero-sized type, the index doesn't matter. Keep looping. if (Q.DL.getTypeAllocSize(GTI.getIndexedType()) == 0) continue; // Fast path the constant operand case both for efficiency and so we don't // increment Depth when just zipping down an all-constant GEP. 
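// For example (hypothetical IR), a non-zero constant index as in
//   getelementptr inbounds i8, i8* %p, i64 4
// is enough on its own: an inbounds GEP in address space 0 that adds a
// non-zero offset cannot produce null.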
if (ConstantInt *OpC = dyn_cast(GTI.getOperand())) { if (!OpC->isZero()) return true; continue; } // We post-increment Depth here because while isKnownNonZero increments it // as well, when we pop back up that increment won't persist. We don't want // to recurse 10k times just because we have 10k GEP operands. We don't // bail completely out because we want to handle constant GEPs regardless // of depth. if (Depth++ >= MaxDepth) continue; if (isKnownNonZero(GTI.getOperand(), Depth, Q)) return true; } return false; } /// Does the 'Range' metadata (which must be a valid MD_range operand list) /// ensure that the value it's attached to is never Value? 'RangeType' is /// is the type of the value described by the range. static bool rangeMetadataExcludesValue(MDNode* Ranges, const APInt& Value) { const unsigned NumRanges = Ranges->getNumOperands() / 2; assert(NumRanges >= 1); for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Lower = mdconst::extract(Ranges->getOperand(2 * i + 0)); ConstantInt *Upper = mdconst::extract(Ranges->getOperand(2 * i + 1)); ConstantRange Range(Lower->getValue(), Upper->getValue()); if (Range.contains(Value)) return false; } return true; } /// Return true if the given value is known to be non-zero when defined. /// For vectors return true if every element is known to be non-zero when /// defined. Supports values with integer or pointer type and vectors of /// integers. bool isKnownNonZero(Value *V, unsigned Depth, const Query &Q) { if (Constant *C = dyn_cast(V)) { if (C->isNullValue()) return false; if (isa(C)) // Must be non-zero due to null test above. return true; // TODO: Handle vectors return false; } if (Instruction* I = dyn_cast(V)) { if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range)) { // If the possible ranges don't contain zero, then the value is // definitely non-zero. if (IntegerType* Ty = dyn_cast(V->getType())) { const APInt ZeroValue(Ty->getBitWidth(), 0); if (rangeMetadataExcludesValue(Ranges, ZeroValue)) return true; } } } // The remaining tests are all recursive, so bail out if we hit the limit. if (Depth++ >= MaxDepth) return false; // Check for pointer simplifications. if (V->getType()->isPointerTy()) { if (isKnownNonNull(V)) return true; if (GEPOperator *GEP = dyn_cast(V)) if (isGEPKnownNonNull(GEP, Depth, Q)) return true; } unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), Q.DL); // X | Y != 0 if X != 0 or Y != 0. Value *X = nullptr, *Y = nullptr; if (match(V, m_Or(m_Value(X), m_Value(Y)))) return isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q); // ext X != 0 if X != 0. if (isa(V) || isa(V)) return isKnownNonZero(cast(V)->getOperand(0), Depth, Q); // shl X, Y != 0 if X is odd. Note that the value of the shift is undefined // if the lowest bit is shifted off the end. if (BitWidth && match(V, m_Shl(m_Value(X), m_Value(Y)))) { // shl nuw can't remove any non-zero bits. OverflowingBinaryOperator *BO = cast(V); if (BO->hasNoUnsignedWrap()) return isKnownNonZero(X, Depth, Q); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); computeKnownBits(X, KnownZero, KnownOne, Depth, Q); if (KnownOne[0]) return true; } // shr X, Y != 0 if X is negative. Note that the value of the shift is not // defined if the sign bit is shifted off the end. else if (match(V, m_Shr(m_Value(X), m_Value(Y)))) { // shr exact can only shift out zero bits. 
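// For example, "lshr exact i32 %x, 3" requires the low 3 bits of %x to be
// zero, so if %x is non-zero some higher bit is set and survives the shift.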
PossiblyExactOperator *BO = cast(V); if (BO->isExact()) return isKnownNonZero(X, Depth, Q); bool XKnownNonNegative, XKnownNegative; ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q); if (XKnownNegative) return true; // If the shifter operand is a constant, and all of the bits shifted // out are known to be zero, and X is known non-zero then at least one // non-zero bit must remain. if (ConstantInt *Shift = dyn_cast(Y)) { APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); computeKnownBits(X, KnownZero, KnownOne, Depth, Q); auto ShiftVal = Shift->getLimitedValue(BitWidth - 1); // Is there a known one in the portion not shifted out? if (KnownOne.countLeadingZeros() < BitWidth - ShiftVal) return true; // Are all the bits to be shifted out known zero? if (KnownZero.countTrailingOnes() >= ShiftVal) return isKnownNonZero(X, Depth, Q); } } // div exact can only produce a zero if the dividend is zero. else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) { return isKnownNonZero(X, Depth, Q); } // X + Y. else if (match(V, m_Add(m_Value(X), m_Value(Y)))) { bool XKnownNonNegative, XKnownNegative; bool YKnownNonNegative, YKnownNegative; ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q); ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Depth, Q); // If X and Y are both non-negative (as signed values) then their sum is not // zero unless both X and Y are zero. if (XKnownNonNegative && YKnownNonNegative) if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q)) return true; // If X and Y are both negative (as signed values) then their sum is not // zero unless both X and Y equal INT_MIN. if (BitWidth && XKnownNegative && YKnownNegative) { APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); APInt Mask = APInt::getSignedMaxValue(BitWidth); // The sign bit of X is set. If some other bit is set then X is not equal // to INT_MIN. computeKnownBits(X, KnownZero, KnownOne, Depth, Q); if ((KnownOne & Mask) != 0) return true; // The sign bit of Y is set. If some other bit is set then Y is not equal // to INT_MIN. computeKnownBits(Y, KnownZero, KnownOne, Depth, Q); if ((KnownOne & Mask) != 0) return true; } // The sum of a non-negative number and a power of two is not zero. if (XKnownNonNegative && isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q)) return true; if (YKnownNonNegative && isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q)) return true; } // X * Y. else if (match(V, m_Mul(m_Value(X), m_Value(Y)))) { OverflowingBinaryOperator *BO = cast(V); // If X and Y are non-zero then so is X * Y as long as the multiplication // does not overflow. if ((BO->hasNoSignedWrap() || BO->hasNoUnsignedWrap()) && isKnownNonZero(X, Depth, Q) && isKnownNonZero(Y, Depth, Q)) return true; } // (C ? X : Y) != 0 if X != 0 and Y != 0. else if (SelectInst *SI = dyn_cast(V)) { if (isKnownNonZero(SI->getTrueValue(), Depth, Q) && isKnownNonZero(SI->getFalseValue(), Depth, Q)) return true; } // PHI else if (PHINode *PN = dyn_cast(V)) { // Try and detect a recurrence that monotonically increases from a // starting value, as these are common as induction variables. 
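// For example (hypothetical IR):
//   %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
//   %iv.next = add nuw nsw i32 %iv, 1
// The start is a positive constant and the step is a non-negative constant
// added without wrapping, so %iv can never become zero.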
if (PN->getNumIncomingValues() == 2) { Value *Start = PN->getIncomingValue(0); Value *Induction = PN->getIncomingValue(1); if (isa(Induction) && !isa(Start)) std::swap(Start, Induction); if (ConstantInt *C = dyn_cast(Start)) { if (!C->isZero() && !C->isNegative()) { ConstantInt *X; if ((match(Induction, m_NSWAdd(m_Specific(PN), m_ConstantInt(X))) || match(Induction, m_NUWAdd(m_Specific(PN), m_ConstantInt(X)))) && !X->isNegative()) return true; } } } } if (!BitWidth) return false; APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); computeKnownBits(V, KnownZero, KnownOne, Depth, Q); return KnownOne != 0; } /// Return true if V2 == V1 + X, where X is known non-zero. static bool isAddOfNonZero(Value *V1, Value *V2, const Query &Q) { BinaryOperator *BO = dyn_cast(V1); if (!BO || BO->getOpcode() != Instruction::Add) return false; Value *Op = nullptr; if (V2 == BO->getOperand(0)) Op = BO->getOperand(1); else if (V2 == BO->getOperand(1)) Op = BO->getOperand(0); else return false; return isKnownNonZero(Op, 0, Q); } /// Return true if it is known that V1 != V2. static bool isKnownNonEqual(Value *V1, Value *V2, const Query &Q) { if (V1->getType()->isVectorTy() || V1 == V2) return false; if (V1->getType() != V2->getType()) // We can't look through casts yet. return false; if (isAddOfNonZero(V1, V2, Q) || isAddOfNonZero(V2, V1, Q)) return true; if (IntegerType *Ty = dyn_cast(V1->getType())) { // Are any known bits in V1 contradictory to known bits in V2? If V1 // has a known zero where V2 has a known one, they must not be equal. auto BitWidth = Ty->getBitWidth(); APInt KnownZero1(BitWidth, 0); APInt KnownOne1(BitWidth, 0); computeKnownBits(V1, KnownZero1, KnownOne1, 0, Q); APInt KnownZero2(BitWidth, 0); APInt KnownOne2(BitWidth, 0); computeKnownBits(V2, KnownZero2, KnownOne2, 0, Q); auto OppositeBits = (KnownZero1 & KnownOne2) | (KnownZero2 & KnownOne1); if (OppositeBits.getBoolValue()) return true; } return false; } /// Return true if 'V & Mask' is known to be zero. We use this predicate to /// simplify operations downstream. Mask is known to be zero for bits that V /// cannot have. /// /// This function is defined on values with integer type, values with pointer /// type, and vectors of integers. In the case /// where V is a vector, the mask, known zero, and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth, const Query &Q) { APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0); computeKnownBits(V, KnownZero, KnownOne, Depth, Q); return (KnownZero & Mask) == Mask; } /// Return the number of times the sign bit of the register is replicated into /// the other bits. We know that at least 1 bit is always equal to the sign bit /// (itself), but other cases can give us information. For example, immediately /// after an "ashr X, 2", we know that the top 3 bits are all equal to each /// other, so we return 3. /// /// 'Op' must have a scalar integer type. /// unsigned ComputeNumSignBits(Value *V, unsigned Depth, const Query &Q) { unsigned TyBits = Q.DL.getTypeSizeInBits(V->getType()->getScalarType()); unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; // Note that ConstantInt is handled by the general computeKnownBits case // below. if (Depth == 6) return 1; // Limit search depth. 
Operator *U = dyn_cast(V); switch (Operator::getOpcode(V)) { default: break; case Instruction::SExt: Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits(); return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp; case Instruction::SDiv: { const APInt *Denominator; // sdiv X, C -> adds log(C) sign bits. if (match(U->getOperand(1), m_APInt(Denominator))) { // Ignore non-positive denominator. if (!Denominator->isStrictlyPositive()) break; // Calculate the incoming numerator bits. unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); // Add floor(log(C)) bits to the numerator bits. return std::min(TyBits, NumBits + Denominator->logBase2()); } break; } case Instruction::SRem: { const APInt *Denominator; // srem X, C -> we know that the result is within [-C+1,C) when C is a // positive constant. This let us put a lower bound on the number of sign // bits. if (match(U->getOperand(1), m_APInt(Denominator))) { // Ignore non-positive denominator. if (!Denominator->isStrictlyPositive()) break; // Calculate the incoming numerator bits. SRem by a positive constant // can't lower the number of sign bits. unsigned NumrBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); // Calculate the leading sign bit constraints by examining the // denominator. Given that the denominator is positive, there are two // cases: // // 1. the numerator is positive. The result range is [0,C) and [0,C) u< // (1 << ceilLogBase2(C)). // // 2. the numerator is negative. Then the result range is (-C,0] and // integers in (-C,0] are either 0 or >u (-1 << ceilLogBase2(C)). // // Thus a lower bound on the number of sign bits is `TyBits - // ceilLogBase2(C)`. unsigned ResBits = TyBits - Denominator->ceilLogBase2(); return std::max(NumrBits, ResBits); } break; } case Instruction::AShr: { Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); // ashr X, C -> adds C sign bits. Vectors too. const APInt *ShAmt; if (match(U->getOperand(1), m_APInt(ShAmt))) { Tmp += ShAmt->getZExtValue(); if (Tmp > TyBits) Tmp = TyBits; } return Tmp; } case Instruction::Shl: { const APInt *ShAmt; if (match(U->getOperand(1), m_APInt(ShAmt))) { // shl destroys sign bits. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); Tmp2 = ShAmt->getZExtValue(); if (Tmp2 >= TyBits || // Bad shift. Tmp2 >= Tmp) break; // Shifted all sign bits out. return Tmp - Tmp2; } break; } case Instruction::And: case Instruction::Or: case Instruction::Xor: // NOT is handled here. // Logical binary ops preserve the number of sign bits at the worst. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (Tmp != 1) { Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); FirstAnswer = std::min(Tmp, Tmp2); // We computed what we know about the sign bits as our first // answer. Now proceed to the generic code that uses // computeKnownBits, and pick whichever answer is better. } break; case Instruction::Select: Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (Tmp == 1) return 1; // Early out. Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q); return std::min(Tmp, Tmp2); case Instruction::Add: // Add can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (Tmp == 1) return 1; // Early out. 
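// For example, two i8 operands that each have at least 3 sign bits lie in
// [-32, 31]; their sum lies in [-64, 62] and therefore still has at least
// 2 sign bits, i.e. min(3, 3) - 1.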
// Special case decrementing a value (ADD X, -1): if (const auto *CRHS = dyn_cast(U->getOperand(1))) if (CRHS->isAllOnesValue()) { APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); computeKnownBits(U->getOperand(0), KnownZero, KnownOne, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue()) return TyBits; // If we are subtracting one from a positive number, there is no carry // out of the result. if (KnownZero.isNegative()) return Tmp; } Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (Tmp2 == 1) return 1; return std::min(Tmp, Tmp2)-1; case Instruction::Sub: Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (Tmp2 == 1) return 1; // Handle NEG. if (const auto *CLHS = dyn_cast(U->getOperand(0))) if (CLHS->isNullValue()) { APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); computeKnownBits(U->getOperand(1), KnownZero, KnownOne, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue()) return TyBits; // If the input is known to be positive (the sign bit is known clear), // the output of the NEG has the same number of sign bits as the input. if (KnownZero.isNegative()) return Tmp2; // Otherwise, we treat this like a SUB. } // Sub can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (Tmp == 1) return 1; // Early out. return std::min(Tmp, Tmp2)-1; case Instruction::PHI: { PHINode *PN = cast(U); unsigned NumIncomingValues = PN->getNumIncomingValues(); // Don't analyze large in-degree PHIs. if (NumIncomingValues > 4) break; // Unreachable blocks may have zero-operand PHI nodes. if (NumIncomingValues == 0) break; // Take the minimum of all incoming values. This can't infinitely loop // because of our depth threshold. Tmp = ComputeNumSignBits(PN->getIncomingValue(0), Depth + 1, Q); for (unsigned i = 1, e = NumIncomingValues; i != e; ++i) { if (Tmp == 1) return Tmp; Tmp = std::min( Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, Q)); } return Tmp; } case Instruction::Trunc: // FIXME: it's tricky to do anything useful for this, but it is an important // case for targets like X86. break; } // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); APInt Mask; computeKnownBits(V, KnownZero, KnownOne, Depth, Q); if (KnownZero.isNegative()) { // sign bit is 0 Mask = KnownZero; } else if (KnownOne.isNegative()) { // sign bit is 1; Mask = KnownOne; } else { // Nothing known. return FirstAnswer; } // Okay, we know that the sign bit in Mask is set. Use CLZ to determine // the number of identical bits in the top of the input value. Mask = ~Mask; Mask <<= Mask.getBitWidth()-TyBits; // Return # leading zeros. We use 'min' here in case Val was zero before // shifting. We don't want to return '64' as for an i32 "0". return std::max(FirstAnswer, std::min(TyBits, Mask.countLeadingZeros())); } /// This function computes the integer multiple of Base that equals V. /// If successful, it returns true and returns the multiple in /// Multiple. If unsuccessful, it returns false. It looks /// through SExt instructions only if LookThroughSExt is true. 
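/// For example, given V = (shl i32 %x, 2) and Base == 4, the shift is
/// treated as a multiply by the constant 4, which is itself a multiple of
/// Base with factor 1, so Multiple is set to %x.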
bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, bool LookThroughSExt, unsigned Depth) { const unsigned MaxDepth = 6; assert(V && "No Value?"); assert(Depth <= MaxDepth && "Limit Search Depth"); assert(V->getType()->isIntegerTy() && "Not integer or pointer type!"); Type *T = V->getType(); ConstantInt *CI = dyn_cast(V); if (Base == 0) return false; if (Base == 1) { Multiple = V; return true; } ConstantExpr *CO = dyn_cast(V); Constant *BaseVal = ConstantInt::get(T, Base); if (CO && CO == BaseVal) { // Multiple is 1. Multiple = ConstantInt::get(T, 1); return true; } if (CI && CI->getZExtValue() % Base == 0) { Multiple = ConstantInt::get(T, CI->getZExtValue() / Base); return true; } if (Depth == MaxDepth) return false; // Limit search depth. Operator *I = dyn_cast(V); if (!I) return false; switch (I->getOpcode()) { default: break; case Instruction::SExt: if (!LookThroughSExt) return false; // otherwise fall through to ZExt case Instruction::ZExt: return ComputeMultiple(I->getOperand(0), Base, Multiple, LookThroughSExt, Depth+1); case Instruction::Shl: case Instruction::Mul: { Value *Op0 = I->getOperand(0); Value *Op1 = I->getOperand(1); if (I->getOpcode() == Instruction::Shl) { ConstantInt *Op1CI = dyn_cast(Op1); if (!Op1CI) return false; // Turn Op0 << Op1 into Op0 * 2^Op1 APInt Op1Int = Op1CI->getValue(); uint64_t BitToSet = Op1Int.getLimitedValue(Op1Int.getBitWidth() - 1); APInt API(Op1Int.getBitWidth(), 0); API.setBit(BitToSet); Op1 = ConstantInt::get(V->getContext(), API); } Value *Mul0 = nullptr; if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) { if (Constant *Op1C = dyn_cast(Op1)) if (Constant *MulC = dyn_cast(Mul0)) { if (Op1C->getType()->getPrimitiveSizeInBits() < MulC->getType()->getPrimitiveSizeInBits()) Op1C = ConstantExpr::getZExt(Op1C, MulC->getType()); if (Op1C->getType()->getPrimitiveSizeInBits() > MulC->getType()->getPrimitiveSizeInBits()) MulC = ConstantExpr::getZExt(MulC, Op1C->getType()); // V == Base * (Mul0 * Op1), so return (Mul0 * Op1) Multiple = ConstantExpr::getMul(MulC, Op1C); return true; } if (ConstantInt *Mul0CI = dyn_cast(Mul0)) if (Mul0CI->getValue() == 1) { // V == Base * Op1, so return Op1 Multiple = Op1; return true; } } Value *Mul1 = nullptr; if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) { if (Constant *Op0C = dyn_cast(Op0)) if (Constant *MulC = dyn_cast(Mul1)) { if (Op0C->getType()->getPrimitiveSizeInBits() < MulC->getType()->getPrimitiveSizeInBits()) Op0C = ConstantExpr::getZExt(Op0C, MulC->getType()); if (Op0C->getType()->getPrimitiveSizeInBits() > MulC->getType()->getPrimitiveSizeInBits()) MulC = ConstantExpr::getZExt(MulC, Op0C->getType()); // V == Base * (Mul1 * Op0), so return (Mul1 * Op0) Multiple = ConstantExpr::getMul(MulC, Op0C); return true; } if (ConstantInt *Mul1CI = dyn_cast(Mul1)) if (Mul1CI->getValue() == 1) { // V == Base * Op0, so return Op0 Multiple = Op0; return true; } } } } // We could not determine if V is a multiple of Base. return false; } /// Return true if we can prove that the specified FP value is never equal to /// -0.0. /// /// NOTE: this function will need to be revisited when we support non-default /// rounding modes! /// bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) { if (const ConstantFP *CFP = dyn_cast(V)) return !CFP->getValueAPF().isNegZero(); // FIXME: Magic number! At the least, this should be given a name because it's // used similarly in CannotBeOrderedLessThanZero(). 
A better fix may be to // expose it as a parameter, so it can be used for testing / experimenting. if (Depth == 6) return false; // Limit search depth. const Operator *I = dyn_cast(V); if (!I) return false; // Check if the nsz fast-math flag is set if (const FPMathOperator *FPO = dyn_cast(I)) if (FPO->hasNoSignedZeros()) return true; // (add x, 0.0) is guaranteed to return +0.0, not -0.0. if (I->getOpcode() == Instruction::FAdd) if (ConstantFP *CFP = dyn_cast(I->getOperand(1))) if (CFP->isNullValue()) return true; // sitofp and uitofp turn into +0.0 for zero. if (isa(I) || isa(I)) return true; if (const IntrinsicInst *II = dyn_cast(I)) // sqrt(-0.0) = -0.0, no other negative results are possible. if (II->getIntrinsicID() == Intrinsic::sqrt) return CannotBeNegativeZero(II->getArgOperand(0), Depth+1); if (const CallInst *CI = dyn_cast(I)) if (const Function *F = CI->getCalledFunction()) { if (F->isDeclaration()) { // abs(x) != -0.0 if (F->getName() == "abs") return true; // fabs[lf](x) != -0.0 if (F->getName() == "fabs") return true; if (F->getName() == "fabsf") return true; if (F->getName() == "fabsl") return true; if (F->getName() == "sqrt" || F->getName() == "sqrtf" || F->getName() == "sqrtl") return CannotBeNegativeZero(CI->getArgOperand(0), Depth+1); } } return false; } bool llvm::CannotBeOrderedLessThanZero(const Value *V, unsigned Depth) { if (const ConstantFP *CFP = dyn_cast(V)) return !CFP->getValueAPF().isNegative() || CFP->getValueAPF().isZero(); // FIXME: Magic number! At the least, this should be given a name because it's // used similarly in CannotBeNegativeZero(). A better fix may be to // expose it as a parameter, so it can be used for testing / experimenting. if (Depth == 6) return false; // Limit search depth. const Operator *I = dyn_cast(V); if (!I) return false; switch (I->getOpcode()) { default: break; // Unsigned integers are always nonnegative. case Instruction::UIToFP: return true; case Instruction::FMul: // x*x is always non-negative or a NaN. if (I->getOperand(0) == I->getOperand(1)) return true; // Fall through case Instruction::FAdd: case Instruction::FDiv: case Instruction::FRem: return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) && CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1); case Instruction::Select: return CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1) && CannotBeOrderedLessThanZero(I->getOperand(2), Depth+1); case Instruction::FPExt: case Instruction::FPTrunc: // Widening/narrowing never change sign. return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1); case Instruction::Call: if (const IntrinsicInst *II = dyn_cast(I)) switch (II->getIntrinsicID()) { default: break; case Intrinsic::maxnum: return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) || CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1); case Intrinsic::minnum: return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1) && CannotBeOrderedLessThanZero(I->getOperand(1), Depth+1); case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::fabs: case Intrinsic::sqrt: return true; case Intrinsic::powi: if (ConstantInt *CI = dyn_cast(I->getOperand(1))) { // powi(x,n) is non-negative if n is even. if (CI->getBitWidth() <= 64 && CI->getSExtValue() % 2u == 0) return true; } return CannotBeOrderedLessThanZero(I->getOperand(0), Depth+1); case Intrinsic::fma: case Intrinsic::fmuladd: // x*x+y is non-negative if y is non-negative. 
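// (x*x is either non-negative or NaN, and adding a y that cannot be ordered
// less than zero cannot produce a result that is ordered less than zero.)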
return I->getOperand(0) == I->getOperand(1) && CannotBeOrderedLessThanZero(I->getOperand(2), Depth+1); } break; } return false; } /// If the specified value can be set by repeating the same byte in memory, /// return the i8 value that it is represented with. This is /// true for all i8 values obviously, but is also true for i32 0, i32 -1, /// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated /// byte store (e.g. i16 0x1234), return null. Value *llvm::isBytewiseValue(Value *V) { // All byte-wide stores are splatable, even of arbitrary variables. if (V->getType()->isIntegerTy(8)) return V; // Handle 'null' ConstantArrayZero etc. if (Constant *C = dyn_cast(V)) if (C->isNullValue()) return Constant::getNullValue(Type::getInt8Ty(V->getContext())); // Constant float and double values can be handled as integer values if the // corresponding integer value is "byteable". An important case is 0.0. if (ConstantFP *CFP = dyn_cast(V)) { if (CFP->getType()->isFloatTy()) V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(V->getContext())); if (CFP->getType()->isDoubleTy()) V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(V->getContext())); // Don't handle long double formats, which have strange constraints. } // We can handle constant integers that are multiple of 8 bits. if (ConstantInt *CI = dyn_cast(V)) { if (CI->getBitWidth() % 8 == 0) { assert(CI->getBitWidth() > 8 && "8 bits should be handled above!"); if (!CI->getValue().isSplat(8)) return nullptr; return ConstantInt::get(V->getContext(), CI->getValue().trunc(8)); } } // A ConstantDataArray/Vector is splatable if all its members are equal and // also splatable. if (ConstantDataSequential *CA = dyn_cast(V)) { Value *Elt = CA->getElementAsConstant(0); Value *Val = isBytewiseValue(Elt); if (!Val) return nullptr; for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I) if (CA->getElementAsConstant(I) != Elt) return nullptr; return Val; } // Conceptually, we could handle things like: // %a = zext i8 %X to i16 // %b = shl i16 %a, 8 // %c = or i16 %a, %b // but until there is an example that actually needs this, it doesn't seem // worth worrying about. return nullptr; } // This is the recursive version of BuildSubAggregate. It takes a few different // arguments. Idxs is the index within the nested struct From that we are // looking at now (which is of type IndexedType). IdxSkip is the number of // indices from Idxs that should be left out when inserting into the resulting // struct. To is the result struct built so far, new insertvalue instructions // build on that. static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType, SmallVectorImpl &Idxs, unsigned IdxSkip, Instruction *InsertBefore) { llvm::StructType *STy = dyn_cast(IndexedType); if (STy) { // Save the original To argument so we can modify it Value *OrigTo = To; // General case, the type indexed by Idxs is a struct for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { // Process each struct element recursively Idxs.push_back(i); Value *PrevTo = To; To = BuildSubAggregate(From, To, STy->getElementType(i), Idxs, IdxSkip, InsertBefore); Idxs.pop_back(); if (!To) { // Couldn't find any inserted value for this index? 
Cleanup while (PrevTo != OrigTo) { InsertValueInst* Del = cast(PrevTo); PrevTo = Del->getAggregateOperand(); Del->eraseFromParent(); } // Stop processing elements break; } } // If we successfully found a value for each of our subaggregates if (To) return To; } // Base case, the type indexed by SourceIdxs is not a struct, or not all of // the struct's elements had a value that was inserted directly. In the latter // case, perhaps we can't determine each of the subelements individually, but // we might be able to find the complete struct somewhere. // Find the value that is at that particular spot Value *V = FindInsertedValue(From, Idxs); if (!V) return nullptr; // Insert the value in the new (sub) aggregate return llvm::InsertValueInst::Create(To, V, makeArrayRef(Idxs).slice(IdxSkip), "tmp", InsertBefore); } // This helper takes a nested struct and extracts a part of it (which is again a // struct) into a new value. For example, given the struct: // { a, { b, { c, d }, e } } // and the indices "1, 1" this returns // { c, d }. // // It does this by inserting an insertvalue for each element in the resulting // struct, as opposed to just inserting a single struct. This will only work if // each of the elements of the substruct is known (i.e., inserted into From by an // insertvalue instruction somewhere). // // All inserted insertvalue instructions are inserted before InsertBefore static Value *BuildSubAggregate(Value *From, ArrayRef idx_range, Instruction *InsertBefore) { assert(InsertBefore && "Must have someplace to insert!"); Type *IndexedType = ExtractValueInst::getIndexedType(From->getType(), idx_range); Value *To = UndefValue::get(IndexedType); SmallVector Idxs(idx_range.begin(), idx_range.end()); unsigned IdxSkip = Idxs.size(); return BuildSubAggregate(From, To, IndexedType, Idxs, IdxSkip, InsertBefore); } /// Given an aggregate and a sequence of indices, see if /// the scalar value indexed is already around as a register, for example if it /// were inserted directly into the aggregate. /// /// If InsertBefore is not null, this function will duplicate (modified) /// insertvalues when a part of a nested struct is extracted. Value *llvm::FindInsertedValue(Value *V, ArrayRef idx_range, Instruction *InsertBefore) { // Nothing to index? Just return V then (this is useful at the end of our // recursion). if (idx_range.empty()) return V; // We have indices, so V should have an indexable type. assert((V->getType()->isStructTy() || V->getType()->isArrayTy()) && "Not looking at a struct or array?"); assert(ExtractValueInst::getIndexedType(V->getType(), idx_range) && "Invalid indices for type?"); if (Constant *C = dyn_cast(V)) { C = C->getAggregateElement(idx_range[0]); if (!C) return nullptr; return FindInsertedValue(C, idx_range.slice(1), InsertBefore); } if (InsertValueInst *I = dyn_cast(V)) { // Loop the indices for the insertvalue instruction in parallel with the // requested indices const unsigned *req_idx = idx_range.begin(); for (const unsigned *i = I->idx_begin(), *e = I->idx_end(); i != e; ++i, ++req_idx) { if (req_idx == idx_range.end()) { // We can't handle this without inserting insertvalues if (!InsertBefore) return nullptr; // The requested index identifies a part of a nested aggregate. Handle // this specially.
For example, // %A = insertvalue { i32, {i32, i32 } } undef, i32 10, 1, 0 // %B = insertvalue { i32, {i32, i32 } } %A, i32 11, 1, 1 // %C = extractvalue {i32, { i32, i32 } } %B, 1 // This can be changed into // %A = insertvalue {i32, i32 } undef, i32 10, 0 // %C = insertvalue {i32, i32 } %A, i32 11, 1 // which allows the unused 0,0 element from the nested struct to be // removed. return BuildSubAggregate(V, makeArrayRef(idx_range.begin(), req_idx), InsertBefore); } // This insert value inserts something else than what we are looking for. // See if the (aggregate) value inserted into has the value we are // looking for, then. if (*req_idx != *i) return FindInsertedValue(I->getAggregateOperand(), idx_range, InsertBefore); } // If we end up here, the indices of the insertvalue match with those // requested (though possibly only partially). Now we recursively look at // the inserted value, passing any remaining indices. return FindInsertedValue(I->getInsertedValueOperand(), makeArrayRef(req_idx, idx_range.end()), InsertBefore); } if (ExtractValueInst *I = dyn_cast(V)) { // If we're extracting a value from an aggregate that was extracted from // something else, we can extract from that something else directly instead. // However, we will need to chain I's indices with the requested indices. // Calculate the number of indices required unsigned size = I->getNumIndices() + idx_range.size(); // Allocate some space to put the new indices in SmallVector Idxs; Idxs.reserve(size); // Add indices from the extract value instruction Idxs.append(I->idx_begin(), I->idx_end()); // Add requested indices Idxs.append(idx_range.begin(), idx_range.end()); assert(Idxs.size() == size && "Number of indices added not correct?"); return FindInsertedValue(I->getAggregateOperand(), Idxs, InsertBefore); } // Otherwise, we don't know (such as, extracting from a function return value // or load instruction) return nullptr; } /// Analyze the specified pointer to see if it can be expressed as a base /// pointer plus a constant offset. Return the base and offset to the caller. Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL) { unsigned BitWidth = DL.getPointerTypeSizeInBits(Ptr->getType()); APInt ByteOffset(BitWidth, 0); // We walk up the defs but use a visited set to handle unreachable code. In // that case, we stop after accumulating the cycle once (not that it // matters). SmallPtrSet Visited; while (Visited.insert(Ptr).second) { if (Ptr->getType()->isVectorTy()) break; if (GEPOperator *GEP = dyn_cast(Ptr)) { APInt GEPOffset(BitWidth, 0); if (!GEP->accumulateConstantOffset(DL, GEPOffset)) break; ByteOffset += GEPOffset; Ptr = GEP->getPointerOperand(); } else if (Operator::getOpcode(Ptr) == Instruction::BitCast || Operator::getOpcode(Ptr) == Instruction::AddrSpaceCast) { Ptr = cast(Ptr)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(Ptr)) { if (GA->mayBeOverridden()) break; Ptr = GA->getAliasee(); } else { break; } } Offset = ByteOffset.getSExtValue(); return Ptr; } /// This function computes the length of a null-terminated C string pointed to /// by V. If successful, it returns true and returns the string in Str. /// If unsuccessful, it returns false. bool llvm::getConstantStringInfo(const Value *V, StringRef &Str, uint64_t Offset, bool TrimAtNul) { assert(V); // Look through bitcast instructions and geps. V = V->stripPointerCasts(); // If the value is a GEP instruction or constant expression, treat it as an // offset. 
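// For example (hypothetical IR), a pointer such as
//   getelementptr [6 x i8], [6 x i8]* @str, i32 0, i64 2
// is handled by recursing on @str with the requested offset increased by 2.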
if (const GEPOperator *GEP = dyn_cast(V)) { // Make sure the GEP has exactly three arguments. if (GEP->getNumOperands() != 3) return false; // Make sure the index-ee is a pointer to array of i8. PointerType *PT = cast(GEP->getOperand(0)->getType()); ArrayType *AT = dyn_cast(PT->getElementType()); if (!AT || !AT->getElementType()->isIntegerTy(8)) return false; // Check to make sure that the first operand of the GEP is an integer and // has value 0 so that we are sure we're indexing into the initializer. const ConstantInt *FirstIdx = dyn_cast(GEP->getOperand(1)); if (!FirstIdx || !FirstIdx->isZero()) return false; // If the second index isn't a ConstantInt, then this is a variable index // into the array. If this occurs, we can't say anything meaningful about // the string. uint64_t StartIdx = 0; if (const ConstantInt *CI = dyn_cast(GEP->getOperand(2))) StartIdx = CI->getZExtValue(); else return false; return getConstantStringInfo(GEP->getOperand(0), Str, StartIdx + Offset, TrimAtNul); } // The GEP instruction, constant or instruction, must reference a global // variable that is a constant and is initialized. The referenced constant // initializer is the array that we'll use for optimization. const GlobalVariable *GV = dyn_cast(V); if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) return false; // Handle the all-zeros case if (GV->getInitializer()->isNullValue()) { // This is a degenerate case. The initializer is constant zero so the // length of the string must be zero. Str = ""; return true; } // Must be a Constant Array const ConstantDataArray *Array = dyn_cast(GV->getInitializer()); if (!Array || !Array->isString()) return false; // Get the number of elements in the array uint64_t NumElts = Array->getType()->getArrayNumElements(); // Start out with the entire array in the StringRef. Str = Array->getAsString(); if (Offset > NumElts) return false; // Skip over 'offset' bytes. Str = Str.substr(Offset); if (TrimAtNul) { // Trim off the \0 and anything after it. If the array is not nul // terminated, we just return the whole end of string. The client may know // some other way that the string is length-bound. Str = Str.substr(0, Str.find('\0')); } return true; } // These next two are very similar to the above, but also look through PHI // nodes. // TODO: See if we can integrate these two together. /// If we can compute the length of the string pointed to by /// the specified pointer, return 'len+1'. If we can't, return 0. static uint64_t GetStringLengthH(Value *V, SmallPtrSetImpl &PHIs) { // Look through noop bitcast instructions. V = V->stripPointerCasts(); // If this is a PHI node, there are two cases: either we have already seen it // or we haven't. if (PHINode *PN = dyn_cast(V)) { if (!PHIs.insert(PN).second) return ~0ULL; // already in the set. // If it was new, see if all the input strings are the same length. uint64_t LenSoFar = ~0ULL; for (Value *IncValue : PN->incoming_values()) { uint64_t Len = GetStringLengthH(IncValue, PHIs); if (Len == 0) return 0; // Unknown length -> unknown. if (Len == ~0ULL) continue; if (Len != LenSoFar && LenSoFar != ~0ULL) return 0; // Disagree -> unknown. LenSoFar = Len; } // Success, all agree. 
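// For example, a phi of pointers into "foo" and "bar" yields 4 (length plus
// the nul terminator) from both arms, so the phi itself yields 4; an
// incoming value that cycles back to this phi contributes the ~0ULL marker
// and is simply skipped.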
return LenSoFar; } // strlen(select(c,x,y)) -> strlen(x) ^ strlen(y) if (SelectInst *SI = dyn_cast(V)) { uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs); if (Len1 == 0) return 0; uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs); if (Len2 == 0) return 0; if (Len1 == ~0ULL) return Len2; if (Len2 == ~0ULL) return Len1; if (Len1 != Len2) return 0; return Len1; } // Otherwise, see if we can read the string. StringRef StrData; if (!getConstantStringInfo(V, StrData)) return 0; return StrData.size()+1; } /// If we can compute the length of the string pointed to by /// the specified pointer, return 'len+1'. If we can't, return 0. uint64_t llvm::GetStringLength(Value *V) { if (!V->getType()->isPointerTy()) return 0; SmallPtrSet PHIs; uint64_t Len = GetStringLengthH(V, PHIs); // If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return // an empty string as a length. return Len == ~0ULL ? 1 : Len; } /// \brief \p PN defines a loop-variant pointer to an object. Check if the /// previous iteration of the loop was referring to the same object as \p PN. static bool isSameUnderlyingObjectInLoop(PHINode *PN, LoopInfo *LI) { // Find the loop-defined value. Loop *L = LI->getLoopFor(PN->getParent()); if (PN->getNumIncomingValues() != 2) return true; // Find the value from previous iteration. auto *PrevValue = dyn_cast(PN->getIncomingValue(0)); if (!PrevValue || LI->getLoopFor(PrevValue->getParent()) != L) PrevValue = dyn_cast(PN->getIncomingValue(1)); if (!PrevValue || LI->getLoopFor(PrevValue->getParent()) != L) return true; // If a new pointer is loaded in the loop, the pointer references a different // object in every iteration. E.g.: // for (i) // int *p = a[i]; // ... if (auto *Load = dyn_cast(PrevValue)) if (!L->isLoopInvariant(Load->getPointerOperand())) return false; return true; } Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup) { if (!V->getType()->isPointerTy()) return V; for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) { if (GEPOperator *GEP = dyn_cast(V)) { V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast || Operator::getOpcode(V) == Instruction::AddrSpaceCast) { V = cast(V)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(V)) { if (GA->mayBeOverridden()) return V; V = GA->getAliasee(); } else { // See if InstructionSimplify knows any relevant tricks. if (Instruction *I = dyn_cast(V)) // TODO: Acquire a DominatorTree and AssumptionCache and use them. if (Value *Simplified = SimplifyInstruction(I, DL, nullptr)) { V = Simplified; continue; } return V; } assert(V->getType()->isPointerTy() && "Unexpected operand type!"); } return V; } void llvm::GetUnderlyingObjects(Value *V, SmallVectorImpl &Objects, const DataLayout &DL, LoopInfo *LI, unsigned MaxLookup) { SmallPtrSet Visited; SmallVector Worklist; Worklist.push_back(V); do { Value *P = Worklist.pop_back_val(); P = GetUnderlyingObject(P, DL, MaxLookup); if (!Visited.insert(P).second) continue; if (SelectInst *SI = dyn_cast(P)) { Worklist.push_back(SI->getTrueValue()); Worklist.push_back(SI->getFalseValue()); continue; } if (PHINode *PN = dyn_cast(P)) { // If this PHI changes the underlying object in every iteration of the // loop, don't look through it. Consider: // int **A; // for (i) { // Prev = Curr; // Prev = PHI (Prev_0, Curr) // Curr = A[i]; // *Prev, *Curr; // // Prev is tracking Curr one iteration behind so they refer to different // underlying objects. 
if (!LI || !LI->isLoopHeader(PN->getParent()) || isSameUnderlyingObjectInLoop(PN, LI)) for (Value *IncValue : PN->incoming_values()) Worklist.push_back(IncValue); continue; } Objects.push_back(P); } while (!Worklist.empty()); } /// Return true if the only users of this pointer are lifetime markers. bool llvm::onlyUsedByLifetimeMarkers(const Value *V) { for (const User *U : V->users()) { const IntrinsicInst *II = dyn_cast(U); if (!II) return false; if (II->getIntrinsicID() != Intrinsic::lifetime_start && II->getIntrinsicID() != Intrinsic::lifetime_end) return false; } return true; } static bool isDereferenceableFromAttribute(const Value *BV, APInt Offset, Type *Ty, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT, const TargetLibraryInfo *TLI) { assert(Offset.isNonNegative() && "offset can't be negative"); assert(Ty->isSized() && "must be sized"); APInt DerefBytes(Offset.getBitWidth(), 0); bool CheckForNonNull = false; if (const Argument *A = dyn_cast(BV)) { DerefBytes = A->getDereferenceableBytes(); if (!DerefBytes.getBoolValue()) { DerefBytes = A->getDereferenceableOrNullBytes(); CheckForNonNull = true; } } else if (auto CS = ImmutableCallSite(BV)) { DerefBytes = CS.getDereferenceableBytes(0); if (!DerefBytes.getBoolValue()) { DerefBytes = CS.getDereferenceableOrNullBytes(0); CheckForNonNull = true; } } else if (const LoadInst *LI = dyn_cast(BV)) { if (MDNode *MD = LI->getMetadata(LLVMContext::MD_dereferenceable)) { ConstantInt *CI = mdconst::extract(MD->getOperand(0)); DerefBytes = CI->getLimitedValue(); } if (!DerefBytes.getBoolValue()) { if (MDNode *MD = LI->getMetadata(LLVMContext::MD_dereferenceable_or_null)) { ConstantInt *CI = mdconst::extract(MD->getOperand(0)); DerefBytes = CI->getLimitedValue(); } CheckForNonNull = true; } } if (DerefBytes.getBoolValue()) if (DerefBytes.uge(Offset + DL.getTypeStoreSize(Ty))) if (!CheckForNonNull || isKnownNonNullAt(BV, CtxI, DT, TLI)) return true; return false; } static bool isDereferenceableFromAttribute(const Value *V, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT, const TargetLibraryInfo *TLI) { Type *VTy = V->getType(); Type *Ty = VTy->getPointerElementType(); if (!Ty->isSized()) return false; APInt Offset(DL.getTypeStoreSizeInBits(VTy), 0); return isDereferenceableFromAttribute(V, Offset, Ty, DL, CtxI, DT, TLI); } static bool isAligned(const Value *Base, APInt Offset, unsigned Align, const DataLayout &DL) { APInt BaseAlign(Offset.getBitWidth(), getAlignment(Base, DL)); if (!BaseAlign) { Type *Ty = Base->getType()->getPointerElementType(); if (!Ty->isSized()) return false; BaseAlign = DL.getABITypeAlignment(Ty); } APInt Alignment(Offset.getBitWidth(), Align); assert(Alignment.isPowerOf2() && "must be a power of 2!"); return BaseAlign.uge(Alignment) && !(Offset & (Alignment-1)); } static bool isAligned(const Value *Base, unsigned Align, const DataLayout &DL) { Type *Ty = Base->getType(); assert(Ty->isSized() && "must be sized"); APInt Offset(DL.getTypeStoreSizeInBits(Ty), 0); return isAligned(Base, Offset, Align, DL); } /// Test if V is always a pointer to allocated and suitably aligned memory for /// a simple load or store. static bool isDereferenceableAndAlignedPointer( const Value *V, unsigned Align, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT, const TargetLibraryInfo *TLI, SmallPtrSetImpl &Visited) { // Note that it is not safe to speculate into a malloc'd region because // malloc may return null. // These are obviously ok if aligned. 
if (isa(V)) return isAligned(V, Align, DL); // It's not always safe to follow a bitcast, for example: // bitcast i8* (alloca i8) to i32* // would result in a 4-byte load from a 1-byte alloca. However, // if we're casting from a pointer from a type of larger size // to a type of smaller size (or the same size), and the alignment // is at least as large as for the resulting pointer type, then // we can look through the bitcast. if (const BitCastOperator *BC = dyn_cast(V)) { Type *STy = BC->getSrcTy()->getPointerElementType(), *DTy = BC->getDestTy()->getPointerElementType(); if (STy->isSized() && DTy->isSized() && (DL.getTypeStoreSize(STy) >= DL.getTypeStoreSize(DTy)) && (DL.getABITypeAlignment(STy) >= DL.getABITypeAlignment(DTy))) return isDereferenceableAndAlignedPointer(BC->getOperand(0), Align, DL, CtxI, DT, TLI, Visited); } // Global variables which can't collapse to null are ok. if (const GlobalVariable *GV = dyn_cast(V)) if (!GV->hasExternalWeakLinkage()) return isAligned(V, Align, DL); // byval arguments are okay. if (const Argument *A = dyn_cast(V)) if (A->hasByValAttr()) return isAligned(V, Align, DL); if (isDereferenceableFromAttribute(V, DL, CtxI, DT, TLI)) return isAligned(V, Align, DL); // For GEPs, determine if the indexing lands within the allocated object. if (const GEPOperator *GEP = dyn_cast(V)) { Type *VTy = GEP->getType(); Type *Ty = VTy->getPointerElementType(); const Value *Base = GEP->getPointerOperand(); // Conservatively require that the base pointer be fully dereferenceable // and aligned. if (!Visited.insert(Base).second) return false; if (!isDereferenceableAndAlignedPointer(Base, Align, DL, CtxI, DT, TLI, Visited)) return false; APInt Offset(DL.getPointerTypeSizeInBits(VTy), 0); if (!GEP->accumulateConstantOffset(DL, Offset)) return false; // Check if the load is within the bounds of the underlying object // and offset is aligned. uint64_t LoadSize = DL.getTypeStoreSize(Ty); Type *BaseType = Base->getType()->getPointerElementType(); assert(isPowerOf2_32(Align) && "must be a power of 2!"); return (Offset + LoadSize).ule(DL.getTypeAllocSize(BaseType)) && !(Offset & APInt(Offset.getBitWidth(), Align-1)); } // For gc.relocate, look through relocations if (const GCRelocateInst *RelocateInst = dyn_cast(V)) return isDereferenceableAndAlignedPointer( RelocateInst->getDerivedPtr(), Align, DL, CtxI, DT, TLI, Visited); if (const AddrSpaceCastInst *ASC = dyn_cast(V)) return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Align, DL, CtxI, DT, TLI, Visited); // If we don't know, assume the worst. return false; } bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT, const TargetLibraryInfo *TLI) { // When dereferenceability information is provided by a dereferenceable // attribute, we know exactly how many bytes are dereferenceable. If we can // determine the exact offset to the attributed variable, we can use that // information here. 
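// For example (hypothetical IR), loading an i32 through
//   %q = getelementptr inbounds i32, i32* %p, i64 1
// where %p is marked dereferenceable(16): the stripped base is %p, the
// accumulated offset is 4 bytes, and 4 + 4 <= 16, so the access is known to
// be dereferenceable.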
Type *VTy = V->getType(); Type *Ty = VTy->getPointerElementType(); // Require ABI alignment for loads without alignment specification if (Align == 0) Align = DL.getABITypeAlignment(Ty); if (Ty->isSized()) { APInt Offset(DL.getTypeStoreSizeInBits(VTy), 0); const Value *BV = V->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); if (Offset.isNonNegative()) if (isDereferenceableFromAttribute(BV, Offset, Ty, DL, CtxI, DT, TLI) && isAligned(BV, Offset, Align, DL)) return true; } SmallPtrSet Visited; return ::isDereferenceableAndAlignedPointer(V, Align, DL, CtxI, DT, TLI, Visited); } bool llvm::isDereferenceablePointer(const Value *V, const DataLayout &DL, const Instruction *CtxI, const DominatorTree *DT, const TargetLibraryInfo *TLI) { return isDereferenceableAndAlignedPointer(V, 1, DL, CtxI, DT, TLI); } bool llvm::isSafeToSpeculativelyExecute(const Value *V, const Instruction *CtxI, const DominatorTree *DT, const TargetLibraryInfo *TLI) { const Operator *Inst = dyn_cast(V); if (!Inst) return false; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) if (Constant *C = dyn_cast(Inst->getOperand(i))) if (C->canTrap()) return false; switch (Inst->getOpcode()) { default: return true; case Instruction::UDiv: case Instruction::URem: { // x / y is undefined if y == 0. const APInt *V; if (match(Inst->getOperand(1), m_APInt(V))) return *V != 0; return false; } case Instruction::SDiv: case Instruction::SRem: { // x / y is undefined if y == 0 or x == INT_MIN and y == -1 const APInt *Numerator, *Denominator; if (!match(Inst->getOperand(1), m_APInt(Denominator))) return false; // We cannot hoist this division if the denominator is 0. if (*Denominator == 0) return false; // It's safe to hoist if the denominator is not 0 or -1. if (*Denominator != -1) return true; // At this point we know that the denominator is -1. It is safe to hoist as // long we know that the numerator is not INT_MIN. if (match(Inst->getOperand(0), m_APInt(Numerator))) return !Numerator->isMinSignedValue(); // The numerator *might* be MinSignedValue. return false; } case Instruction::Load: { const LoadInst *LI = cast(Inst); if (!LI->isUnordered() || // Speculative load may create a race that did not exist in the source. LI->getParent()->getParent()->hasFnAttribute( Attribute::SanitizeThread) || // Speculative load may load data from dirty regions. LI->getParent()->getParent()->hasFnAttribute( Attribute::SanitizeAddress)) return false; const DataLayout &DL = LI->getModule()->getDataLayout(); return isDereferenceableAndAlignedPointer( LI->getPointerOperand(), LI->getAlignment(), DL, CtxI, DT, TLI); } case Instruction::Call: { if (const IntrinsicInst *II = dyn_cast(Inst)) { switch (II->getIntrinsicID()) { // These synthetic intrinsics have no side-effects and just mark // information about their operands. // FIXME: There are other no-op synthetic instructions that potentially // should be considered at least *safe* to speculate... case Intrinsic::dbg_declare: case Intrinsic::dbg_value: return true; case Intrinsic::bswap: case Intrinsic::ctlz: case Intrinsic::ctpop: case Intrinsic::cttz: case Intrinsic::objectsize: case Intrinsic::sadd_with_overflow: case Intrinsic::smul_with_overflow: case Intrinsic::ssub_with_overflow: case Intrinsic::uadd_with_overflow: case Intrinsic::umul_with_overflow: case Intrinsic::usub_with_overflow: return true; // Sqrt should be OK, since the llvm sqrt intrinsic isn't defined to set // errno like libm sqrt would. 
case Intrinsic::sqrt: case Intrinsic::fma: case Intrinsic::fmuladd: case Intrinsic::fabs: case Intrinsic::minnum: case Intrinsic::maxnum: return true; // TODO: some fp intrinsics are marked as having the same error handling // as libm. They're safe to speculate when they won't error. // TODO: are convert_{from,to}_fp16 safe? // TODO: can we list target-specific intrinsics here? default: break; } } return false; // The called function could have undefined behavior or // side-effects, even if marked readnone nounwind. } case Instruction::VAArg: case Instruction::Alloca: case Instruction::Invoke: case Instruction::PHI: case Instruction::Store: case Instruction::Ret: case Instruction::Br: case Instruction::IndirectBr: case Instruction::Switch: case Instruction::Unreachable: case Instruction::Fence: case Instruction::AtomicRMW: case Instruction::AtomicCmpXchg: case Instruction::LandingPad: case Instruction::Resume: case Instruction::CatchSwitch: case Instruction::CatchPad: case Instruction::CatchRet: case Instruction::CleanupPad: case Instruction::CleanupRet: return false; // Misc instructions which have effects } } bool llvm::mayBeMemoryDependent(const Instruction &I) { return I.mayReadOrWriteMemory() || !isSafeToSpeculativelyExecute(&I); } /// Return true if we know that the specified value is never null. bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) { assert(V->getType()->isPointerTy() && "V must be pointer type"); // Alloca never returns null, malloc might. if (isa(V)) return true; // A byval, inalloca, or nonnull argument is never null. if (const Argument *A = dyn_cast(V)) return A->hasByValOrInAllocaAttr() || A->hasNonNullAttr(); // A global variable in address space 0 is non null unless extern weak. // Other address spaces may have null as a valid address for a global, // so we can't assume anything. if (const GlobalValue *GV = dyn_cast(V)) return !GV->hasExternalWeakLinkage() && GV->getType()->getAddressSpace() == 0; // A Load tagged w/nonnull metadata is never null. if (const LoadInst *LI = dyn_cast(V)) return LI->getMetadata(LLVMContext::MD_nonnull); if (auto CS = ImmutableCallSite(V)) if (CS.isReturnNonNull()) return true; return false; } static bool isKnownNonNullFromDominatingCondition(const Value *V, const Instruction *CtxI, const DominatorTree *DT) { assert(V->getType()->isPointerTy() && "V must be pointer type"); unsigned NumUsesExplored = 0; for (auto U : V->users()) { // Avoid massive lists if (NumUsesExplored >= DomConditionsMaxUses) break; NumUsesExplored++; // Consider only compare instructions uniquely controlling a branch const ICmpInst *Cmp = dyn_cast(U); if (!Cmp) continue; if (DomConditionsSingleCmpUse && !Cmp->hasOneUse()) continue; for (auto *CmpU : Cmp->users()) { const BranchInst *BI = dyn_cast(CmpU); if (!BI) continue; assert(BI->isConditional() && "uses a comparison!"); BasicBlock *NonNullSuccessor = nullptr; CmpInst::Predicate Pred; if (match(const_cast(Cmp), m_c_ICmp(Pred, m_Specific(V), m_Zero()))) { if (Pred == ICmpInst::ICMP_EQ) NonNullSuccessor = BI->getSuccessor(1); else if (Pred == ICmpInst::ICMP_NE) NonNullSuccessor = BI->getSuccessor(0); } if (NonNullSuccessor) { BasicBlockEdge Edge(BI->getParent(), NonNullSuccessor); if (Edge.isSingleEdge() && DT->dominates(Edge, CtxI->getParent())) return true; } } } return false; } bool llvm::isKnownNonNullAt(const Value *V, const Instruction *CtxI, const DominatorTree *DT, const TargetLibraryInfo *TLI) { if (isKnownNonNull(V, TLI)) return true; return CtxI ? 
::isKnownNonNullFromDominatingCondition(V, CtxI, DT) : false; } OverflowResult llvm::computeOverflowForUnsignedMul(Value *LHS, Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { // Multiplying n * m significant bits yields a result of n + m significant // bits. If the total number of significant bits does not exceed the // result bit width (minus 1), there is no overflow. // This means if we have enough leading zero bits in the operands // we can guarantee that the result does not overflow. // Ref: "Hacker's Delight" by Henry Warren unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, DL, /*Depth=*/0, AC, CxtI, DT); computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, DL, /*Depth=*/0, AC, CxtI, DT); // Note that underestimating the number of zero bits gives a more // conservative answer. unsigned ZeroBits = LHSKnownZero.countLeadingOnes() + RHSKnownZero.countLeadingOnes(); // First handle the easy case: if we have enough zero bits there's // definitely no overflow. if (ZeroBits >= BitWidth) return OverflowResult::NeverOverflows; // Get the largest possible values for each operand. APInt LHSMax = ~LHSKnownZero; APInt RHSMax = ~RHSKnownZero; // We know the multiply operation doesn't overflow if the maximum values for // each operand will not overflow after we multiply them together. bool MaxOverflow; LHSMax.umul_ov(RHSMax, MaxOverflow); if (!MaxOverflow) return OverflowResult::NeverOverflows; // We know it always overflows if multiplying the smallest possible values for // the operands also results in overflow. bool MinOverflow; LHSKnownOne.umul_ov(RHSKnownOne, MinOverflow); if (MinOverflow) return OverflowResult::AlwaysOverflows; return OverflowResult::MayOverflow; } OverflowResult llvm::computeOverflowForUnsignedAdd(Value *LHS, Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { bool LHSKnownNonNegative, LHSKnownNegative; ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0, AC, CxtI, DT); if (LHSKnownNonNegative || LHSKnownNegative) { bool RHSKnownNonNegative, RHSKnownNegative; ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0, AC, CxtI, DT); if (LHSKnownNegative && RHSKnownNegative) { // The sign bit is set in both cases: this MUST overflow. // Create a simple add instruction, and insert it into the struct. return OverflowResult::AlwaysOverflows; } if (LHSKnownNonNegative && RHSKnownNonNegative) { // The sign bit is clear in both cases: this CANNOT overflow. // Create a simple add instruction, and insert it into the struct. 
return OverflowResult::NeverOverflows; } } return OverflowResult::MayOverflow; } static OverflowResult computeOverflowForSignedAdd( Value *LHS, Value *RHS, AddOperator *Add, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { if (Add && Add->hasNoSignedWrap()) { return OverflowResult::NeverOverflows; } bool LHSKnownNonNegative, LHSKnownNegative; bool RHSKnownNonNegative, RHSKnownNegative; ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0, AC, CxtI, DT); ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0, AC, CxtI, DT); if ((LHSKnownNonNegative && RHSKnownNegative) || (LHSKnownNegative && RHSKnownNonNegative)) { // The sign bits are opposite: this CANNOT overflow. return OverflowResult::NeverOverflows; } // The remaining code needs Add to be available. Early returns if not so. if (!Add) return OverflowResult::MayOverflow; // If the sign of Add is the same as at least one of the operands, this add // CANNOT overflow. This is particularly useful when the sum is // @llvm.assume'ed non-negative rather than proved so from analyzing its // operands. bool LHSOrRHSKnownNonNegative = (LHSKnownNonNegative || RHSKnownNonNegative); bool LHSOrRHSKnownNegative = (LHSKnownNegative || RHSKnownNegative); if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) { bool AddKnownNonNegative, AddKnownNegative; ComputeSignBit(Add, AddKnownNonNegative, AddKnownNegative, DL, /*Depth=*/0, AC, CxtI, DT); if ((AddKnownNonNegative && LHSOrRHSKnownNonNegative) || (AddKnownNegative && LHSOrRHSKnownNegative)) { return OverflowResult::NeverOverflows; } } return OverflowResult::MayOverflow; } OverflowResult llvm::computeOverflowForSignedAdd(AddOperator *Add, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::computeOverflowForSignedAdd(Add->getOperand(0), Add->getOperand(1), Add, DL, AC, CxtI, DT); } OverflowResult llvm::computeOverflowForSignedAdd(Value *LHS, Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::computeOverflowForSignedAdd(LHS, RHS, nullptr, DL, AC, CxtI, DT); } bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) { // FIXME: This conservative implementation can be relaxed. E.g. most // atomic operations are guaranteed to terminate on most platforms // and most functions terminate. return !I->isAtomic() && // atomics may never succeed on some platforms !isa(I) && // could throw and might not terminate !isa(I) && // might not terminate and could throw to // non-successor (see bug 24185 for details). !isa(I) && // has no successors !isa(I); // has no successors } bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I, const Loop *L) { // The loop header is guaranteed to be executed for every iteration. // // FIXME: Relax this constraint to cover all basic blocks that are // guaranteed to be executed at every iteration. 
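  // Illustrative example (not part of this patch): in a loop header such as
  //   header:
  //     %v = load i32, i32* %p          ; reached on every iteration
  //     call void @may_not_return()
  //     %w = add i32 %v, 1              ; not guaranteed: the call may throw
  //                                     ; or never return
  // only the instructions preceding the first instruction that might fail to
  // transfer execution to its successor are guaranteed to execute.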
if (I->getParent() != L->getHeader()) return false; for (const Instruction &LI : *L->getHeader()) { if (&LI == I) return true; if (!isGuaranteedToTransferExecutionToSuccessor(&LI)) return false; } llvm_unreachable("Instruction not contained in its own parent basic block."); } bool llvm::propagatesFullPoison(const Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::Sub: case Instruction::Xor: case Instruction::Trunc: case Instruction::BitCast: case Instruction::AddrSpaceCast: // These operations all propagate poison unconditionally. Note that poison // is not any particular value, so xor or subtraction of poison with // itself still yields poison, not zero. return true; case Instruction::AShr: case Instruction::SExt: // For these operations, one bit of the input is replicated across // multiple output bits. A replicated poison bit is still poison. return true; case Instruction::Shl: { // Left shift *by* a poison value is poison. The number of // positions to shift is unsigned, so no negative values are // possible there. Left shift by zero places preserves poison. So // it only remains to consider left shift of poison by a positive // number of places. // // A left shift by a positive number of places leaves the lowest order bit // non-poisoned. However, if such a shift has a no-wrap flag, then we can // make the poison operand violate that flag, yielding a fresh full-poison // value. auto *OBO = cast(I); return OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap(); } case Instruction::Mul: { // A multiplication by zero yields a non-poison zero result, so we need to // rule out zero as an operand. Conservatively, multiplication by a // non-zero constant is not multiplication by zero. // // Multiplication by a non-zero constant can leave some bits // non-poisoned. For example, a multiplication by 2 leaves the lowest // order bit unpoisoned. So we need to consider that. // // Multiplication by 1 preserves poison. If the multiplication has a // no-wrap flag, then we can make the poison operand violate that flag // when multiplied by any integer other than 0 and 1. auto *OBO = cast(I); if (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) { for (Value *V : OBO->operands()) { if (auto *CI = dyn_cast(V)) { // A ConstantInt cannot yield poison, so we can assume that it is // the other operand that is poison. return !CI->isZero(); } } } return false; } case Instruction::GetElementPtr: // A GEP implicitly represents a sequence of additions, subtractions, // truncations, sign extensions and multiplications. The multiplications // are by the non-zero sizes of some set of types, so we do not have to be // concerned with multiplication by zero. If the GEP is in-bounds, then // these operations are implicitly no-signed-wrap so poison is propagated // by the arguments above for Add, Sub, Trunc, SExt and Mul. 
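    // Illustrative example (not part of this patch):
    //   %p = getelementptr inbounds i8, i8* %base, i64 %idx
    // is full poison when %idx is poison, because the inbounds GEP behaves
    // like a chain of no-signed-wrap adds of the scaled index to %base.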
return cast(I)->isInBounds(); default: return false; } } const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) { switch (I->getOpcode()) { case Instruction::Store: return cast(I)->getPointerOperand(); case Instruction::Load: return cast(I)->getPointerOperand(); case Instruction::AtomicCmpXchg: return cast(I)->getPointerOperand(); case Instruction::AtomicRMW: return cast(I)->getPointerOperand(); case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: return I->getOperand(1); default: return nullptr; } } bool llvm::isKnownNotFullPoison(const Instruction *PoisonI) { // We currently only look for uses of poison values within the same basic // block, as that makes it easier to guarantee that the uses will be // executed given that PoisonI is executed. // // FIXME: Expand this to consider uses beyond the same basic block. To do // this, look out for the distinction between post-dominance and strong // post-dominance. const BasicBlock *BB = PoisonI->getParent(); // Set of instructions that we have proved will yield poison if PoisonI // does. SmallSet YieldsPoison; YieldsPoison.insert(PoisonI); for (BasicBlock::const_iterator I = PoisonI->getIterator(), E = BB->end(); I != E; ++I) { if (&*I != PoisonI) { const Value *NotPoison = getGuaranteedNonFullPoisonOp(&*I); if (NotPoison != nullptr && YieldsPoison.count(NotPoison)) return true; if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) return false; } // Mark poison that propagates from I through uses of I. if (YieldsPoison.count(&*I)) { for (const User *User : I->users()) { const Instruction *UserI = cast(User); if (UserI->getParent() == BB && propagatesFullPoison(UserI)) YieldsPoison.insert(User); } } } return false; } static bool isKnownNonNaN(Value *V, FastMathFlags FMF) { if (FMF.noNaNs()) return true; if (auto *C = dyn_cast(V)) return !C->isNaN(); return false; } static bool isKnownNonZero(Value *V) { if (auto *C = dyn_cast(V)) return !C->isZero(); return false; } static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred, FastMathFlags FMF, Value *CmpLHS, Value *CmpRHS, Value *TrueVal, Value *FalseVal, Value *&LHS, Value *&RHS) { LHS = CmpLHS; RHS = CmpRHS; // If the predicate is an "or-equal" (FP) predicate, then signed zeroes may // return inconsistent results between implementations. // (0.0 <= -0.0) ? 0.0 : -0.0 // Returns 0.0 // minNum(0.0, -0.0) // May return -0.0 or 0.0 (IEEE 754-2008 5.3.1) // Therefore we behave conservatively and only proceed if at least one of the // operands is known to not be zero, or if we don't care about signed zeroes. switch (Pred) { default: break; case CmpInst::FCMP_OGE: case CmpInst::FCMP_OLE: case CmpInst::FCMP_UGE: case CmpInst::FCMP_ULE: if (!FMF.noSignedZeros() && !isKnownNonZero(CmpLHS) && !isKnownNonZero(CmpRHS)) return {SPF_UNKNOWN, SPNB_NA, false}; } SelectPatternNaNBehavior NaNBehavior = SPNB_NA; bool Ordered = false; // When given one NaN and one non-NaN input: // - maxnum/minnum (C99 fmaxf()/fminf()) return the non-NaN input. // - A simple C99 (a < b ? a : b) construction will return 'b' (as the // ordered comparison fails), which could be NaN or non-NaN. // so here we discover exactly what NaN behavior is required/accepted. if (CmpInst::isFPPredicate(Pred)) { bool LHSSafe = isKnownNonNaN(CmpLHS, FMF); bool RHSSafe = isKnownNonNaN(CmpRHS, FMF); if (LHSSafe && RHSSafe) { // Both operands are known non-NaN. 
NaNBehavior = SPNB_RETURNS_ANY; } else if (CmpInst::isOrdered(Pred)) { // An ordered comparison will return false when given a NaN, so it // returns the RHS. Ordered = true; if (LHSSafe) // LHS is non-NaN, so if RHS is NaN then NaN will be returned. NaNBehavior = SPNB_RETURNS_NAN; else if (RHSSafe) NaNBehavior = SPNB_RETURNS_OTHER; else // Completely unsafe. return {SPF_UNKNOWN, SPNB_NA, false}; } else { Ordered = false; // An unordered comparison will return true when given a NaN, so it // returns the LHS. if (LHSSafe) // LHS is non-NaN, so if RHS is NaN then non-NaN will be returned. NaNBehavior = SPNB_RETURNS_OTHER; else if (RHSSafe) NaNBehavior = SPNB_RETURNS_NAN; else // Completely unsafe. return {SPF_UNKNOWN, SPNB_NA, false}; } } if (TrueVal == CmpRHS && FalseVal == CmpLHS) { std::swap(CmpLHS, CmpRHS); Pred = CmpInst::getSwappedPredicate(Pred); if (NaNBehavior == SPNB_RETURNS_NAN) NaNBehavior = SPNB_RETURNS_OTHER; else if (NaNBehavior == SPNB_RETURNS_OTHER) NaNBehavior = SPNB_RETURNS_NAN; Ordered = !Ordered; } // ([if]cmp X, Y) ? X : Y if (TrueVal == CmpLHS && FalseVal == CmpRHS) { switch (Pred) { default: return {SPF_UNKNOWN, SPNB_NA, false}; // Equality. case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return {SPF_UMAX, SPNB_NA, false}; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: return {SPF_SMAX, SPNB_NA, false}; case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: return {SPF_UMIN, SPNB_NA, false}; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: return {SPF_SMIN, SPNB_NA, false}; case FCmpInst::FCMP_UGT: case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_OGT: case FCmpInst::FCMP_OGE: return {SPF_FMAXNUM, NaNBehavior, Ordered}; case FCmpInst::FCMP_ULT: case FCmpInst::FCMP_ULE: case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_OLE: return {SPF_FMINNUM, NaNBehavior, Ordered}; } } if (ConstantInt *C1 = dyn_cast(CmpRHS)) { if ((CmpLHS == TrueVal && match(FalseVal, m_Neg(m_Specific(CmpLHS)))) || (CmpLHS == FalseVal && match(TrueVal, m_Neg(m_Specific(CmpLHS))))) { // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X if (Pred == ICmpInst::ICMP_SGT && (C1->isZero() || C1->isMinusOne())) { return {(CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false}; } // ABS(X) ==> (X (X isZero() || C1->isOne())) { return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false}; } } // Y >s C ? ~Y : ~C == ~Y (FalseVal)) { if (C1->getType() == C2->getType() && ~C1->getValue() == C2->getValue() && (match(TrueVal, m_Not(m_Specific(CmpLHS))) || match(CmpLHS, m_Not(m_Specific(TrueVal))))) { LHS = TrueVal; RHS = FalseVal; return {SPF_SMIN, SPNB_NA, false}; } } } // TODO: (X > 4) ? X : 5 --> (X >= 5) ? X : 5 --> MAX(X, 5) return {SPF_UNKNOWN, SPNB_NA, false}; } static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2, Instruction::CastOps *CastOp) { CastInst *CI = dyn_cast(V1); Constant *C = dyn_cast(V2); CastInst *CI2 = dyn_cast(V2); if (!CI) return nullptr; *CastOp = CI->getOpcode(); if (CI2) { // If V1 and V2 are both the same cast from the same type, we can look // through V1. if (CI2->getOpcode() == CI->getOpcode() && CI2->getSrcTy() == CI->getSrcTy()) return CI2->getOperand(0); return nullptr; } else if (!C) { return nullptr; } if (isa(CI) && CmpI->isSigned()) { Constant *T = ConstantExpr::getTrunc(C, CI->getSrcTy()); // This is only valid if the truncated value can be sign-extended // back to the original value. 
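    // Illustrative example (not part of this patch): when looking through
    // "sext i8 %x to i32" against an i32 constant, C = 100 truncates to
    // i8 100 and sign-extends back to 100, so the narrowed constant can be
    // used; C = 300 truncates to i8 44, which sign-extends back to 44 rather
    // than 300, so we have to give up.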
if (ConstantExpr::getSExt(T, C->getType()) == C) return T; return nullptr; } if (isa(CI) && CmpI->isUnsigned()) return ConstantExpr::getTrunc(C, CI->getSrcTy()); if (isa(CI)) return ConstantExpr::getIntegerCast(C, CI->getSrcTy(), CmpI->isSigned()); if (isa(CI)) return ConstantExpr::getUIToFP(C, CI->getSrcTy(), true); if (isa(CI)) return ConstantExpr::getSIToFP(C, CI->getSrcTy(), true); if (isa(CI)) return ConstantExpr::getFPToUI(C, CI->getSrcTy(), true); if (isa(CI)) return ConstantExpr::getFPToSI(C, CI->getSrcTy(), true); if (isa(CI)) return ConstantExpr::getFPExtend(C, CI->getSrcTy(), true); if (isa(CI)) return ConstantExpr::getFPTrunc(C, CI->getSrcTy(), true); return nullptr; } SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp) { SelectInst *SI = dyn_cast(V); if (!SI) return {SPF_UNKNOWN, SPNB_NA, false}; CmpInst *CmpI = dyn_cast(SI->getCondition()); if (!CmpI) return {SPF_UNKNOWN, SPNB_NA, false}; CmpInst::Predicate Pred = CmpI->getPredicate(); Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); Value *TrueVal = SI->getTrueValue(); Value *FalseVal = SI->getFalseValue(); FastMathFlags FMF; if (isa(CmpI)) FMF = CmpI->getFastMathFlags(); // Bail out early. if (CmpI->isEquality()) return {SPF_UNKNOWN, SPNB_NA, false}; // Deal with type mismatches. if (CastOp && CmpLHS->getType() != TrueVal->getType()) { if (Value *C = lookThroughCast(CmpI, TrueVal, FalseVal, CastOp)) return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, cast(TrueVal)->getOperand(0), C, LHS, RHS); if (Value *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp)) return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, C, cast(FalseVal)->getOperand(0), LHS, RHS); } return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS); } ConstantRange llvm::getConstantRangeFromMetadata(MDNode &Ranges) { const unsigned NumRanges = Ranges.getNumOperands() / 2; assert(NumRanges >= 1 && "Must have at least one range!"); assert(Ranges.getNumOperands() % 2 == 0 && "Must be a sequence of pairs"); auto *FirstLow = mdconst::extract(Ranges.getOperand(0)); auto *FirstHigh = mdconst::extract(Ranges.getOperand(1)); ConstantRange CR(FirstLow->getValue(), FirstHigh->getValue()); for (unsigned i = 1; i < NumRanges; ++i) { auto *Low = mdconst::extract(Ranges.getOperand(2 * i + 0)); auto *High = mdconst::extract(Ranges.getOperand(2 * i + 1)); // Note: unionWith will potentially create a range that contains values not // contained in any of the original N ranges. CR = CR.unionWith(ConstantRange(Low->getValue(), High->getValue())); } return CR; } /// Return true if "icmp Pred LHS RHS" is always true. 
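/// For example (illustrative), "icmp ule %x, (add nuw %x, C)" is always true
/// for any constant C, and "icmp sle %x, (add nsw %x, C)" is always true
/// whenever C is non-negative.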
static bool isTruePredicate(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { assert(!LHS->getType()->isVectorTy() && "TODO: extend to handle vectors!"); if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS) return true; switch (Pred) { default: return false; case CmpInst::ICMP_SLE: { const APInt *C; // LHS s<= LHS +_{nsw} C if C >= 0 if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C)))) return !C->isNegative(); return false; } case CmpInst::ICMP_ULE: { const APInt *C; // LHS u<= LHS +_{nuw} C for any C if (match(RHS, m_NUWAdd(m_Specific(LHS), m_APInt(C)))) return true; // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB) auto MatchNUWAddsToSameValue = [&](Value *A, Value *B, Value *&X, const APInt *&CA, const APInt *&CB) { if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) && match(B, m_NUWAdd(m_Specific(X), m_APInt(CB)))) return true; // If X & C == 0 then (X | C) == X +_{nuw} C if (match(A, m_Or(m_Value(X), m_APInt(CA))) && match(B, m_Or(m_Specific(X), m_APInt(CB)))) { unsigned BitWidth = CA->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); computeKnownBits(X, KnownZero, KnownOne, DL, Depth + 1, AC, CxtI, DT); if ((KnownZero & *CA) == *CA && (KnownZero & *CB) == *CB) return true; } return false; }; Value *X; const APInt *CLHS, *CRHS; if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS)) return CLHS->ule(*CRHS); return false; } } } /// Return true if "icmp Pred BLHS BRHS" is true whenever "icmp Pred /// ALHS ARHS" is true. static bool isImpliedCondOperands(CmpInst::Predicate Pred, Value *ALHS, Value *ARHS, Value *BLHS, Value *BRHS, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { switch (Pred) { default: return false; case CmpInst::ICMP_SLT: case CmpInst::ICMP_SLE: return isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth, AC, CxtI, DT) && isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth, AC, CxtI, DT); case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE: return isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth, AC, CxtI, DT) && isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth, AC, CxtI, DT); } } bool llvm::isImpliedCondition(Value *LHS, Value *RHS, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { assert(LHS->getType() == RHS->getType() && "mismatched type"); Type *OpTy = LHS->getType(); assert(OpTy->getScalarType()->isIntegerTy(1)); // LHS ==> RHS by definition if (LHS == RHS) return true; if (OpTy->isVectorTy()) // TODO: extending the code below to handle vectors return false; assert(OpTy->isIntegerTy(1) && "implied by above"); ICmpInst::Predicate APred, BPred; Value *ALHS, *ARHS; Value *BLHS, *BRHS; if (!match(LHS, m_ICmp(APred, m_Value(ALHS), m_Value(ARHS))) || !match(RHS, m_ICmp(BPred, m_Value(BLHS), m_Value(BRHS)))) return false; if (APred == BPred) return isImpliedCondOperands(APred, ALHS, ARHS, BLHS, BRHS, DL, Depth, AC, CxtI, DT); return false; } diff --git a/llvm/lib/CodeGen/GCRootLowering.cpp b/llvm/lib/CodeGen/GCRootLowering.cpp index 484d31737b2e..df6be1e4242f 100644 --- a/llvm/lib/CodeGen/GCRootLowering.cpp +++ b/llvm/lib/CodeGen/GCRootLowering.cpp @@ -1,356 +1,355 @@ //===-- GCRootLowering.cpp - Garbage collection infrastructure ------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// // // This file implements the lowering for the gc.root mechanism. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GCMetadata.h" #include "llvm/CodeGen/GCStrategy.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; namespace { /// LowerIntrinsics - This pass rewrites calls to the llvm.gcread or /// llvm.gcwrite intrinsics, replacing them with simple loads and stores as /// directed by the GCStrategy. It also performs automatic root initialization /// and custom intrinsic lowering. class LowerIntrinsics : public FunctionPass { bool PerformDefaultLowering(Function &F, GCStrategy &Coll); public: static char ID; LowerIntrinsics(); const char *getPassName() const override; void getAnalysisUsage(AnalysisUsage &AU) const override; bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; }; /// GCMachineCodeAnalysis - This is a target-independent pass over the machine /// function representation to identify safe points for the garbage collector /// in the machine code. It inserts labels at safe points and populates a /// GCMetadata record for each function. class GCMachineCodeAnalysis : public MachineFunctionPass { GCFunctionInfo *FI; MachineModuleInfo *MMI; const TargetInstrInfo *TII; void FindSafePoints(MachineFunction &MF); void VisitCallPoint(MachineBasicBlock::iterator MI); MCSymbol *InsertLabel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL) const; void FindStackOffsets(MachineFunction &MF); public: static char ID; GCMachineCodeAnalysis(); void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnMachineFunction(MachineFunction &MF) override; }; } // ----------------------------------------------------------------------------- INITIALIZE_PASS_BEGIN(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false) INITIALIZE_PASS_DEPENDENCY(GCModuleInfo) INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false) FunctionPass *llvm::createGCLoweringPass() { return new LowerIntrinsics(); } char LowerIntrinsics::ID = 0; LowerIntrinsics::LowerIntrinsics() : FunctionPass(ID) { initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry()); } const char *LowerIntrinsics::getPassName() const { return "Lower Garbage Collection Instructions"; } void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const { FunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addPreserved(); } static bool NeedsDefaultLoweringPass(const GCStrategy &C) { // Default lowering is necessary only if read or write barriers have a default // action. The default for roots is no action. return !C.customWriteBarrier() || !C.customReadBarrier() || C.initializeRoots(); } /// doInitialization - If this module uses the GC intrinsics, find them now. 
bool LowerIntrinsics::doInitialization(Module &M) { GCModuleInfo *MI = getAnalysisIfAvailable(); assert(MI && "LowerIntrinsics didn't require GCModuleInfo!?"); for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) if (!I->isDeclaration() && I->hasGC()) MI->getFunctionInfo(*I); // Instantiate the GC strategy. return false; } /// CouldBecomeSafePoint - Predicate to conservatively determine whether the /// instruction could introduce a safe point. static bool CouldBecomeSafePoint(Instruction *I) { // The natural definition of instructions which could introduce safe points // are: // // - call, invoke (AfterCall, BeforeCall) // - phis (Loops) // - invoke, ret, unwind (Exit) // // However, instructions as seemingly inoccuous as arithmetic can become // libcalls upon lowering (e.g., div i64 on a 32-bit platform), so instead // it is necessary to take a conservative approach. if (isa(I) || isa(I) || isa(I) || isa(I)) return false; // llvm.gcroot is safe because it doesn't do anything at runtime. if (CallInst *CI = dyn_cast(I)) if (Function *F = CI->getCalledFunction()) if (Intrinsic::ID IID = F->getIntrinsicID()) if (IID == Intrinsic::gcroot) return false; return true; } static bool InsertRootInitializers(Function &F, AllocaInst **Roots, unsigned Count) { // Scroll past alloca instructions. BasicBlock::iterator IP = F.getEntryBlock().begin(); while (isa(IP)) ++IP; // Search for initializers in the initial BB. SmallPtrSet InitedRoots; for (; !CouldBecomeSafePoint(&*IP); ++IP) if (StoreInst *SI = dyn_cast(IP)) if (AllocaInst *AI = dyn_cast(SI->getOperand(1)->stripPointerCasts())) InitedRoots.insert(AI); // Add root initializers. bool MadeChange = false; for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I) if (!InitedRoots.count(*I)) { StoreInst *SI = new StoreInst( - ConstantPointerNull::get(cast( - cast((*I)->getType())->getElementType())), + ConstantPointerNull::get(cast((*I)->getAllocatedType())), *I); SI->insertAfter(*I); MadeChange = true; } return MadeChange; } /// runOnFunction - Replace gcread/gcwrite intrinsics with loads and stores. /// Leave gcroot intrinsics; the code generator needs to see those. bool LowerIntrinsics::runOnFunction(Function &F) { // Quick exit for functions that do not use GC. if (!F.hasGC()) return false; GCFunctionInfo &FI = getAnalysis().getFunctionInfo(F); GCStrategy &S = FI.getStrategy(); bool MadeChange = false; if (NeedsDefaultLoweringPass(S)) MadeChange |= PerformDefaultLowering(F, S); return MadeChange; } bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) { bool LowerWr = !S.customWriteBarrier(); bool LowerRd = !S.customReadBarrier(); bool InitRoots = S.initializeRoots(); SmallVector Roots; bool MadeChange = false; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { if (IntrinsicInst *CI = dyn_cast(II++)) { Function *F = CI->getCalledFunction(); switch (F->getIntrinsicID()) { case Intrinsic::gcwrite: if (LowerWr) { // Replace a write barrier with a simple store. Value *St = new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2), CI); CI->replaceAllUsesWith(St); CI->eraseFromParent(); } break; case Intrinsic::gcread: if (LowerRd) { // Replace a read barrier with a simple load. Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI); Ld->takeName(CI); CI->replaceAllUsesWith(Ld); CI->eraseFromParent(); } break; case Intrinsic::gcroot: if (InitRoots) { // Initialize the GC root, but do not delete the intrinsic. 
The // backend needs the intrinsic to flag the stack slot. Roots.push_back( cast(CI->getArgOperand(0)->stripPointerCasts())); } break; default: continue; } MadeChange = true; } } } if (Roots.size()) MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size()); return MadeChange; } // ----------------------------------------------------------------------------- char GCMachineCodeAnalysis::ID = 0; char &llvm::GCMachineCodeAnalysisID = GCMachineCodeAnalysis::ID; INITIALIZE_PASS(GCMachineCodeAnalysis, "gc-analysis", "Analyze Machine Code For Garbage Collection", false, false) GCMachineCodeAnalysis::GCMachineCodeAnalysis() : MachineFunctionPass(ID) {} void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); AU.setPreservesAll(); AU.addRequired(); AU.addRequired(); } MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL) const { MCSymbol *Label = MBB.getParent()->getContext().createTempSymbol(); BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label); return Label; } void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) { // Find the return address (next instruction), too, so as to bracket the call // instruction. MachineBasicBlock::iterator RAI = CI; ++RAI; if (FI->getStrategy().needsSafePoint(GC::PreCall)) { MCSymbol *Label = InsertLabel(*CI->getParent(), CI, CI->getDebugLoc()); FI->addSafePoint(GC::PreCall, Label, CI->getDebugLoc()); } if (FI->getStrategy().needsSafePoint(GC::PostCall)) { MCSymbol *Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc()); FI->addSafePoint(GC::PostCall, Label, CI->getDebugLoc()); } } void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) { for (MachineFunction::iterator BBI = MF.begin(), BBE = MF.end(); BBI != BBE; ++BBI) for (MachineBasicBlock::iterator MI = BBI->begin(), ME = BBI->end(); MI != ME; ++MI) if (MI->isCall()) { // Do not treat tail or sibling call sites as safe points. This is // legal since any arguments passed to the callee which live in the // remnants of the callers frame will be owned and updated by the // callee if required. if (MI->isTerminator()) continue; VisitCallPoint(MI); } } void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) { const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); assert(TFI && "TargetRegisterInfo not available!"); for (GCFunctionInfo::roots_iterator RI = FI->roots_begin(); RI != FI->roots_end();) { // If the root references a dead object, no need to keep it. if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) { RI = FI->removeStackRoot(RI); } else { unsigned FrameReg; // FIXME: surely GCRoot ought to store the // register that the offset is from? RI->StackOffset = TFI->getFrameIndexReference(MF, RI->Num, FrameReg); ++RI; } } } bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) { // Quick exit for functions that do not use GC. if (!MF.getFunction()->hasGC()) return false; FI = &getAnalysis().getFunctionInfo(*MF.getFunction()); MMI = &getAnalysis(); TII = MF.getSubtarget().getInstrInfo(); // Find the size of the stack frame. There may be no correct static frame // size, we use UINT64_MAX to represent this. const MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const bool DynamicFrameSize = MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF); FI->setFrameSize(DynamicFrameSize ? 
UINT64_MAX : MFI->getStackSize()); // Find all safe points. if (FI->getStrategy().needsSafePoints()) FindSafePoints(MF); // Find the concrete stack offsets for all roots (stack slots) FindStackOffsets(MF); return false; } diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index d308f5dd9ebf..f6299597b699 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1,3233 +1,3233 @@ //===- GlobalOpt.cpp - Optimize Global Variables --------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass transforms simple global variables that never have their address // taken. If obviously true, it marks read/write globals as constant, deletes // variables only stored to, etc. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include using namespace llvm; #define DEBUG_TYPE "globalopt" STATISTIC(NumMarked , "Number of globals marked constant"); STATISTIC(NumUnnamed , "Number of globals marked unnamed_addr"); STATISTIC(NumSRA , "Number of aggregate globals broken into scalars"); STATISTIC(NumHeapSRA , "Number of heap objects SRA'd"); STATISTIC(NumSubstitute,"Number of globals with initializers stored into them"); STATISTIC(NumDeleted , "Number of globals deleted"); STATISTIC(NumGlobUses , "Number of global uses devirtualized"); STATISTIC(NumLocalized , "Number of globals localized"); STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans"); STATISTIC(NumFastCallFns , "Number of functions converted to fastcc"); STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated"); STATISTIC(NumNestRemoved , "Number of nest attributes removed"); STATISTIC(NumAliasesResolved, "Number of global aliases resolved"); STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated"); STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed"); namespace { struct GlobalOpt : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); } static char ID; // Pass identification, replacement for typeid GlobalOpt() : ModulePass(ID) { initializeGlobalOptPass(*PassRegistry::getPassRegistry()); } bool runOnModule(Module &M) override; private: bool 
OptimizeFunctions(Module &M); bool OptimizeGlobalVars(Module &M); bool OptimizeGlobalAliases(Module &M); bool deleteIfDead(GlobalValue &GV); bool processGlobal(GlobalValue &GV); bool processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS); bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); bool isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV); TargetLibraryInfo *TLI; SmallSet NotDiscardableComdats; }; } char GlobalOpt::ID = 0; INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); } /// Is this global variable possibly used by a leak checker as a root? If so, /// we might not really want to eliminate the stores to it. static bool isLeakCheckerRoot(GlobalVariable *GV) { // A global variable is a root if it is a pointer, or could plausibly contain // a pointer. There are two challenges; one is that we could have a struct // the has an inner member which is a pointer. We recurse through the type to // detect these (up to a point). The other is that we may actually be a union // of a pointer and another type, and so our LLVM type is an integer which // gets converted into a pointer, or our type is an [i8 x #] with a pointer // potentially contained here. if (GV->hasPrivateLinkage()) return false; SmallVector Types; Types.push_back(GV->getValueType()); unsigned Limit = 20; do { Type *Ty = Types.pop_back_val(); switch (Ty->getTypeID()) { default: break; case Type::PointerTyID: return true; case Type::ArrayTyID: case Type::VectorTyID: { SequentialType *STy = cast(Ty); Types.push_back(STy->getElementType()); break; } case Type::StructTyID: { StructType *STy = cast(Ty); if (STy->isOpaque()) return true; for (StructType::element_iterator I = STy->element_begin(), E = STy->element_end(); I != E; ++I) { Type *InnerTy = *I; if (isa(InnerTy)) return true; if (isa(InnerTy)) Types.push_back(InnerTy); } break; } } if (--Limit == 0) return true; } while (!Types.empty()); return false; } /// Given a value that is stored to a global but never read, determine whether /// it's safe to remove the store and the chain of computation that feeds the /// store. static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { do { if (isa(V)) return true; if (!V->hasOneUse()) return false; if (isa(V) || isa(V) || isa(V) || isa(V)) return false; if (isAllocationFn(V, TLI)) return true; Instruction *I = cast(V); if (I->mayHaveSideEffects()) return false; if (GetElementPtrInst *GEP = dyn_cast(I)) { if (!GEP->hasAllConstantIndices()) return false; } else if (I->getNumOperands() != 1) { return false; } V = I->getOperand(0); } while (1); } /// This GV is a pointer root. Loop over all users of the global and clean up /// any that obviously don't assign the global a value that isn't dynamically /// allocated. static bool CleanupPointerRootUsers(GlobalVariable *GV, const TargetLibraryInfo *TLI) { // A brief explanation of leak checkers. The goal is to find bugs where // pointers are forgotten, causing an accumulating growth in memory // usage over time. The common strategy for leak checkers is to whitelist the // memory pointed to by globals at exit. 
This is popular because it also // solves another problem where the main thread of a C++ program may shut down // before other threads that are still expecting to use those globals. To // handle that case, we expect the program may create a singleton and never // destroy it. bool Changed = false; // If Dead[n].first is the only use of a malloc result, we can delete its // chain of computation and the store to the global in Dead[n].second. SmallVector, 32> Dead; // Constants can't be pointers to dynamically allocated memory. for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end(); UI != E;) { User *U = *UI++; if (StoreInst *SI = dyn_cast(U)) { Value *V = SI->getValueOperand(); if (isa(V)) { Changed = true; SI->eraseFromParent(); } else if (Instruction *I = dyn_cast(V)) { if (I->hasOneUse()) Dead.push_back(std::make_pair(I, SI)); } } else if (MemSetInst *MSI = dyn_cast(U)) { if (isa(MSI->getValue())) { Changed = true; MSI->eraseFromParent(); } else if (Instruction *I = dyn_cast(MSI->getValue())) { if (I->hasOneUse()) Dead.push_back(std::make_pair(I, MSI)); } } else if (MemTransferInst *MTI = dyn_cast(U)) { GlobalVariable *MemSrc = dyn_cast(MTI->getSource()); if (MemSrc && MemSrc->isConstant()) { Changed = true; MTI->eraseFromParent(); } else if (Instruction *I = dyn_cast(MemSrc)) { if (I->hasOneUse()) Dead.push_back(std::make_pair(I, MTI)); } } else if (ConstantExpr *CE = dyn_cast(U)) { if (CE->use_empty()) { CE->destroyConstant(); Changed = true; } } else if (Constant *C = dyn_cast(U)) { if (isSafeToDestroyConstant(C)) { C->destroyConstant(); // This could have invalidated UI, start over from scratch. Dead.clear(); CleanupPointerRootUsers(GV, TLI); return true; } } } for (int i = 0, e = Dead.size(); i != e; ++i) { if (IsSafeComputationToRemove(Dead[i].first, TLI)) { Dead[i].second->eraseFromParent(); Instruction *I = Dead[i].first; do { if (isAllocationFn(I, TLI)) break; Instruction *J = dyn_cast(I->getOperand(0)); if (!J) break; I->eraseFromParent(); I = J; } while (1); I->eraseFromParent(); } } return Changed; } /// We just marked GV constant. Loop over all users of the global, cleaning up /// the obvious ones. This is largely just a quick scan over the use list to /// clean up the easy and obvious cruft. This returns true if it made a change. static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, const DataLayout &DL, TargetLibraryInfo *TLI) { bool Changed = false; // Note that we need to use a weak value handle for the worklist items. When // we delete a constant array, we may also be holding pointer to one of its // elements (or an element of one of its elements if we're dealing with an // array of arrays) in the worklist. SmallVector WorkList(V->user_begin(), V->user_end()); while (!WorkList.empty()) { Value *UV = WorkList.pop_back_val(); if (!UV) continue; User *U = cast(UV); if (LoadInst *LI = dyn_cast(U)) { if (Init) { // Replace the load with the initializer. LI->replaceAllUsesWith(Init); LI->eraseFromParent(); Changed = true; } } else if (StoreInst *SI = dyn_cast(U)) { // Store must be unreachable or storing Init into the global. 
SI->eraseFromParent(); Changed = true; } else if (ConstantExpr *CE = dyn_cast(U)) { if (CE->getOpcode() == Instruction::GetElementPtr) { Constant *SubInit = nullptr; if (Init) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, TLI); } else if ((CE->getOpcode() == Instruction::BitCast && CE->getType()->isPointerTy()) || CE->getOpcode() == Instruction::AddrSpaceCast) { // Pointer cast, delete any stores and memsets to the global. Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, TLI); } if (CE->use_empty()) { CE->destroyConstant(); Changed = true; } } else if (GetElementPtrInst *GEP = dyn_cast(U)) { // Do not transform "gepinst (gep constexpr (GV))" here, because forming // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold // and will invalidate our notion of what Init is. Constant *SubInit = nullptr; if (!isa(GEP->getOperand(0))) { ConstantExpr *CE = dyn_cast_or_null( ConstantFoldInstruction(GEP, DL, TLI)); if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); // If the initializer is an all-null value and we have an inbounds GEP, // we already know what the result of any load from that GEP is. // TODO: Handle splats. if (Init && isa(Init) && GEP->isInBounds()) SubInit = Constant::getNullValue(GEP->getType()->getElementType()); } Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, TLI); if (GEP->use_empty()) { GEP->eraseFromParent(); Changed = true; } } else if (MemIntrinsic *MI = dyn_cast(U)) { // memset/cpy/mv if (MI->getRawDest() == V) { MI->eraseFromParent(); Changed = true; } } else if (Constant *C = dyn_cast(U)) { // If we have a chain of dead constantexprs or other things dangling from // us, and if they are all dead, nuke them without remorse. if (isSafeToDestroyConstant(C)) { C->destroyConstant(); CleanupConstantGlobalUsers(V, Init, DL, TLI); return true; } } } return Changed; } /// Return true if the specified instruction is a safe user of a derived /// expression from a global that we want to SROA. static bool isSafeSROAElementUse(Value *V) { // We might have a dead and dangling constant hanging off of here. if (Constant *C = dyn_cast(V)) return isSafeToDestroyConstant(C); Instruction *I = dyn_cast(V); if (!I) return false; // Loads are ok. if (isa(I)) return true; // Stores *to* the pointer are ok. if (StoreInst *SI = dyn_cast(I)) return SI->getOperand(0) != V; // Otherwise, it must be a GEP. GetElementPtrInst *GEPI = dyn_cast(I); if (!GEPI) return false; if (GEPI->getNumOperands() < 3 || !isa(GEPI->getOperand(1)) || !cast(GEPI->getOperand(1))->isNullValue()) return false; for (User *U : GEPI->users()) if (!isSafeSROAElementUse(U)) return false; return true; } /// U is a direct user of the specified global value. Look at it and its uses /// and decide whether it is safe to SROA this global. static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) { // The user of the global must be a GEP Inst or a ConstantExpr GEP. if (!isa(U) && (!isa(U) || cast(U)->getOpcode() != Instruction::GetElementPtr)) return false; // Check to see if this ConstantExpr GEP is SRA'able. In particular, we // don't like < 3 operand CE's, and we don't like non-constant integer // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some // value of C. 
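  // Illustrative example (not part of this patch): for
  //   @G = internal global [8 x { i32, i32 }] zeroinitializer
  // a use like "gep @G, 0, 3, 1" is SRA-friendly (constant, in-range outer
  // index), whereas "gep @G, 0, %i, 1" (non-constant index) or
  // "gep @G, 0, 9, 1" (out-of-range index) makes the global unsafe to split.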
if (U->getNumOperands() < 3 || !isa(U->getOperand(1)) || !cast(U->getOperand(1))->isNullValue() || !isa(U->getOperand(2))) return false; gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U); ++GEPI; // Skip over the pointer index. // If this is a use of an array allocation, do a bit more checking for sanity. if (ArrayType *AT = dyn_cast(*GEPI)) { uint64_t NumElements = AT->getNumElements(); ConstantInt *Idx = cast(U->getOperand(2)); // Check to make sure that index falls within the array. If not, // something funny is going on, so we won't do the optimization. // if (Idx->getZExtValue() >= NumElements) return false; // We cannot scalar repl this level of the array unless any array // sub-indices are in-range constants. In particular, consider: // A[0][i]. We cannot know that the user isn't doing invalid things like // allowing i to index an out-of-range subscript that accesses A[1]. // // Scalar replacing *just* the outer index of the array is probably not // going to be a win anyway, so just give up. for (++GEPI; // Skip array index. GEPI != E; ++GEPI) { uint64_t NumElements; if (ArrayType *SubArrayTy = dyn_cast(*GEPI)) NumElements = SubArrayTy->getNumElements(); else if (VectorType *SubVectorTy = dyn_cast(*GEPI)) NumElements = SubVectorTy->getNumElements(); else { assert((*GEPI)->isStructTy() && "Indexed GEP type is not array, vector, or struct!"); continue; } ConstantInt *IdxVal = dyn_cast(GEPI.getOperand()); if (!IdxVal || IdxVal->getZExtValue() >= NumElements) return false; } } for (User *UU : U->users()) if (!isSafeSROAElementUse(UU)) return false; return true; } /// Look at all uses of the global and decide whether it is safe for us to /// perform this transformation. static bool GlobalUsersSafeToSRA(GlobalValue *GV) { for (User *U : GV->users()) if (!IsUserOfGlobalSafeForSRA(U, GV)) return false; return true; } /// Perform scalar replacement of aggregates on the specified global variable. /// This opens the door for other optimizations by exposing the behavior of the /// program in a more fine-grained way. We have determined that this /// transformation is safe already. We return the first global variable we /// insert so that the caller can reprocess it. static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Make sure this global only has simple uses that we can SRA. if (!GlobalUsersSafeToSRA(GV)) return nullptr; assert(GV->hasLocalLinkage() && !GV->isConstant()); Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); std::vector NewGlobals; Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); // Get the alignment of the global, either explicit or target-specific. unsigned StartAlignment = GV->getAlignment(); if (StartAlignment == 0) StartAlignment = DL.getABITypeAlignment(GV->getType()); if (StructType *STy = dyn_cast(Ty)) { NewGlobals.reserve(STy->getNumElements()); const StructLayout &Layout = *DL.getStructLayout(STy); for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { Constant *In = Init->getAggregateElement(i); assert(In && "Couldn't get element of initializer?"); GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false, GlobalVariable::InternalLinkage, In, GV->getName()+"."+Twine(i), GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); NGV->setExternallyInitialized(GV->isExternallyInitialized()); Globals.push_back(NGV); NewGlobals.push_back(NGV); // Calculate the known alignment of the field. 
If the original aggregate // had 256 byte alignment for example, something might depend on that: // propagate info to each field. uint64_t FieldOffset = Layout.getElementOffset(i); unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset); if (NewAlign > DL.getABITypeAlignment(STy->getElementType(i))) NGV->setAlignment(NewAlign); } } else if (SequentialType *STy = dyn_cast(Ty)) { unsigned NumElements = 0; if (ArrayType *ATy = dyn_cast(STy)) NumElements = ATy->getNumElements(); else NumElements = cast(STy)->getNumElements(); if (NumElements > 16 && GV->hasNUsesOrMore(16)) return nullptr; // It's not worth it. NewGlobals.reserve(NumElements); uint64_t EltSize = DL.getTypeAllocSize(STy->getElementType()); unsigned EltAlign = DL.getABITypeAlignment(STy->getElementType()); for (unsigned i = 0, e = NumElements; i != e; ++i) { Constant *In = Init->getAggregateElement(i); assert(In && "Couldn't get element of initializer?"); GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false, GlobalVariable::InternalLinkage, In, GV->getName()+"."+Twine(i), GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); NGV->setExternallyInitialized(GV->isExternallyInitialized()); Globals.push_back(NGV); NewGlobals.push_back(NGV); // Calculate the known alignment of the field. If the original aggregate // had 256 byte alignment for example, something might depend on that: // propagate info to each field. unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i); if (NewAlign > EltAlign) NGV->setAlignment(NewAlign); } } if (NewGlobals.empty()) return nullptr; DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n"); Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext())); // Loop over all of the uses of the global, replacing the constantexpr geps, // with smaller constantexpr geps or direct references. while (!GV->use_empty()) { User *GEP = GV->user_back(); assert(((isa(GEP) && cast(GEP)->getOpcode()==Instruction::GetElementPtr)|| isa(GEP)) && "NonGEP CE's are not SRAable!"); // Ignore the 1th operand, which has to be zero or else the program is quite // broken (undefined). Get the 2nd operand, which is the structure or array // index. unsigned Val = cast(GEP->getOperand(2))->getZExtValue(); if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access. Value *NewPtr = NewGlobals[Val]; Type *NewTy = NewGlobals[Val]->getValueType(); // Form a shorter GEP if needed. if (GEP->getNumOperands() > 3) { if (ConstantExpr *CE = dyn_cast(GEP)) { SmallVector Idxs; Idxs.push_back(NullInt); for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i) Idxs.push_back(CE->getOperand(i)); NewPtr = ConstantExpr::getGetElementPtr(NewTy, cast(NewPtr), Idxs); } else { GetElementPtrInst *GEPI = cast(GEP); SmallVector Idxs; Idxs.push_back(NullInt); for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i) Idxs.push_back(GEPI->getOperand(i)); NewPtr = GetElementPtrInst::Create( NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(Val), GEPI); } } GEP->replaceAllUsesWith(NewPtr); if (GetElementPtrInst *GEPI = dyn_cast(GEP)) GEPI->eraseFromParent(); else cast(GEP)->destroyConstant(); } // Delete the old global, now that it is dead. Globals.erase(GV); ++NumSRA; // Loop over the new globals array deleting any globals that are obviously // dead. This can arise due to scalarization of a structure or an array that // has elements that are dead. 
unsigned FirstGlobal = 0; for (unsigned i = 0, e = NewGlobals.size(); i != e; ++i) if (NewGlobals[i]->use_empty()) { Globals.erase(NewGlobals[i]); if (FirstGlobal == i) ++FirstGlobal; } return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr; } /// Return true if all users of the specified value will trap if the value is /// dynamically null. PHIs keeps track of any phi nodes we've seen to avoid /// reprocessing them. static bool AllUsesOfValueWillTrapIfNull(const Value *V, SmallPtrSetImpl &PHIs) { for (const User *U : V->users()) if (isa(U)) { // Will trap. } else if (const StoreInst *SI = dyn_cast(U)) { if (SI->getOperand(0) == V) { //cerr << "NONTRAPPING USE: " << *U; return false; // Storing the value. } } else if (const CallInst *CI = dyn_cast(U)) { if (CI->getCalledValue() != V) { //cerr << "NONTRAPPING USE: " << *U; return false; // Not calling the ptr } } else if (const InvokeInst *II = dyn_cast(U)) { if (II->getCalledValue() != V) { //cerr << "NONTRAPPING USE: " << *U; return false; // Not calling the ptr } } else if (const BitCastInst *CI = dyn_cast(U)) { if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false; } else if (const GetElementPtrInst *GEPI = dyn_cast(U)) { if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false; } else if (const PHINode *PN = dyn_cast(U)) { // If we've already seen this phi node, ignore it, it has already been // checked. if (PHIs.insert(PN).second && !AllUsesOfValueWillTrapIfNull(PN, PHIs)) return false; } else if (isa(U) && isa(U->getOperand(1))) { // Ignore icmp X, null } else { //cerr << "NONTRAPPING USE: " << *U; return false; } return true; } /// Return true if all uses of any loads from GV will trap if the loaded value /// is null. Note that this also permits comparisons of the loaded value /// against null, as a special case. static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { for (const User *U : GV->users()) if (const LoadInst *LI = dyn_cast(U)) { SmallPtrSet PHIs; if (!AllUsesOfValueWillTrapIfNull(LI, PHIs)) return false; } else if (isa(U)) { // Ignore stores to the global. } else { // We don't know or understand this user, bail out. //cerr << "UNKNOWN USER OF GLOBAL!: " << *U; return false; } return true; } static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { bool Changed = false; for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) { Instruction *I = cast(*UI++); if (LoadInst *LI = dyn_cast(I)) { LI->setOperand(0, NewV); Changed = true; } else if (StoreInst *SI = dyn_cast(I)) { if (SI->getOperand(1) == V) { SI->setOperand(1, NewV); Changed = true; } } else if (isa(I) || isa(I)) { CallSite CS(I); if (CS.getCalledValue() == V) { // Calling through the pointer! Turn into a direct call, but be careful // that the pointer is not also being passed as an argument. CS.setCalledFunction(NewV); Changed = true; bool PassedAsArg = false; for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) if (CS.getArgument(i) == V) { PassedAsArg = true; CS.setArgument(i, NewV); } if (PassedAsArg) { // Being passed as an argument also. Be careful to not invalidate UI! UI = V->user_begin(); } } } else if (CastInst *CI = dyn_cast(I)) { Changed |= OptimizeAwayTrappingUsesOfValue(CI, ConstantExpr::getCast(CI->getOpcode(), NewV, CI->getType())); if (CI->use_empty()) { Changed = true; CI->eraseFromParent(); } } else if (GetElementPtrInst *GEPI = dyn_cast(I)) { // Should handle GEP here. 
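      // Collect the GEP's indices for as long as they are constants; if every
      // index is constant we can rebuild the same GEP as a constant expression
      // over NewV and recurse, so trapping uses of the derived pointer get
      // rewritten as well.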
      SmallVector<Constant*, 8> Idxs;
      Idxs.reserve(GEPI->getNumOperands()-1);
      for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
           i != e; ++i)
        if (Constant *C = dyn_cast<Constant>(*i))
          Idxs.push_back(C);
        else
          break;
      if (Idxs.size() == GEPI->getNumOperands()-1)
        Changed |= OptimizeAwayTrappingUsesOfValue(
            GEPI, ConstantExpr::getGetElementPtr(nullptr, NewV, Idxs));
      if (GEPI->use_empty()) {
        Changed = true;
        GEPI->eraseFromParent();
      }
    }
  }

  return Changed;
}

/// The specified global has only one non-null value stored into it.  If there
/// are uses of the loaded value that would trap if the loaded value is
/// dynamically null, then we know that they cannot be reachable with a null
/// value, so we can optimize away the load.
static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
                                            const DataLayout &DL,
                                            TargetLibraryInfo *TLI) {
  bool Changed = false;

  // Keep track of whether we are able to remove all the uses of the global
  // other than the store that defines it.
  bool AllNonStoreUsesGone = true;

  // Replace all uses of loads with uses of uses of the stored value.
  for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end();
       GUI != E; ) {
    User *GlobalUser = *GUI++;
    if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
      Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
      // If we were able to delete all uses of the loads
      if (LI->use_empty()) {
        LI->eraseFromParent();
        Changed = true;
      } else {
        AllNonStoreUsesGone = false;
      }
    } else if (isa<StoreInst>(GlobalUser)) {
      // Ignore the store that stores "LV" to the global.
      assert(GlobalUser->getOperand(1) == GV &&
             "Must be storing *to* the global");
    } else {
      AllNonStoreUsesGone = false;

      // If we get here we could have other crazy uses that are transitively
      // loaded.
      assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
              isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser) ||
              isa<BitCastInst>(GlobalUser) ||
              isa<GetElementPtrInst>(GlobalUser)) &&
             "Only expect load and stores!");
    }
  }

  if (Changed) {
    DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV
                 << "\n");
    ++NumGlobUses;
  }

  // If we nuked all of the loads, then none of the stores are needed either,
  // nor is the global.
  if (AllNonStoreUsesGone) {
    if (isLeakCheckerRoot(GV)) {
      Changed |= CleanupPointerRootUsers(GV, TLI);
    } else {
      Changed = true;
      CleanupConstantGlobalUsers(GV, nullptr, DL, TLI);
    }
    if (GV->use_empty()) {
      DEBUG(dbgs() << "  *** GLOBAL NOW DEAD!\n");
      Changed = true;
      GV->eraseFromParent();
      ++NumDeleted;
    }
  }
  return Changed;
}

/// Walk the use list of V, constant folding all of the instructions that are
/// foldable.
static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
                                TargetLibraryInfo *TLI) {
  for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; )
    if (Instruction *I = dyn_cast<Instruction>(*UI++))
      if (Constant *NewC = ConstantFoldInstruction(I, DL, TLI)) {
        I->replaceAllUsesWith(NewC);

        // Advance UI to the next non-I use to avoid invalidating it!
        // Instructions could multiply use V.
        while (UI != E && *UI == I)
          ++UI;
        I->eraseFromParent();
      }
}

/// This function takes the specified global variable, and transforms the
/// program as if it always contained the result of the specified malloc.
/// Because it is always the result of the specified malloc, there is no reason
/// to actually DO the malloc.  Instead, turn the malloc into a global, and
/// rewrite any loads of GV as uses of the new global.
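/// For illustration (hypothetical names): if the only store into
///   @G = internal global %T* null
/// is a malloc result bitcast to %T* and stored to @G, then a new
///   @G.body = internal global %T undef
/// is created, the malloc disappears, and each 'load %T*, %T** @G' is
/// effectively replaced by the address of @G.body.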
static GlobalVariable * OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, ConstantInt *NElements, const DataLayout &DL, TargetLibraryInfo *TLI) { DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n'); Type *GlobalType; if (NElements->getZExtValue() == 1) GlobalType = AllocTy; else // If we have an array allocation, the global variable is of an array. GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue()); // Create the new global variable. The contents of the malloc'd memory is // undefined, so initialize with an undef value. GlobalVariable *NewGV = new GlobalVariable( *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage, UndefValue::get(GlobalType), GV->getName() + ".body", nullptr, GV->getThreadLocalMode()); // If there are bitcast users of the malloc (which is typical, usually we have // a malloc + bitcast) then replace them with uses of the new global. Update // other users to use the global as well. BitCastInst *TheBC = nullptr; while (!CI->use_empty()) { Instruction *User = cast(CI->user_back()); if (BitCastInst *BCI = dyn_cast(User)) { if (BCI->getType() == NewGV->getType()) { BCI->replaceAllUsesWith(NewGV); BCI->eraseFromParent(); } else { BCI->setOperand(0, NewGV); } } else { if (!TheBC) TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI); User->replaceUsesOfWith(CI, TheBC); } } Constant *RepValue = NewGV; if (NewGV->getType() != GV->getValueType()) RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType()); // If there is a comparison against null, we will insert a global bool to // keep track of whether the global was initialized yet or not. GlobalVariable *InitBool = new GlobalVariable(Type::getInt1Ty(GV->getContext()), false, GlobalValue::InternalLinkage, ConstantInt::getFalse(GV->getContext()), GV->getName()+".init", GV->getThreadLocalMode()); bool InitBoolUsed = false; // Loop over all uses of GV, processing them in turn. while (!GV->use_empty()) { if (StoreInst *SI = dyn_cast(GV->user_back())) { // The global is initialized when the store to it occurs. new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0, SI->getOrdering(), SI->getSynchScope(), SI); SI->eraseFromParent(); continue; } LoadInst *LI = cast(GV->user_back()); while (!LI->use_empty()) { Use &LoadUse = *LI->use_begin(); ICmpInst *ICI = dyn_cast(LoadUse.getUser()); if (!ICI) { LoadUse = RepValue; continue; } // Replace the cmp X, 0 with a use of the bool value. // Sink the load to where the compare was, if atomic rules allow us to. Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", false, 0, LI->getOrdering(), LI->getSynchScope(), LI->isUnordered() ? (Instruction*)ICI : LI); InitBoolUsed = true; switch (ICI->getPredicate()) { default: llvm_unreachable("Unknown ICmp Predicate!"); case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_SLT: // X < null -> always false LV = ConstantInt::getFalse(GV->getContext()); break; case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_EQ: LV = BinaryOperator::CreateNot(LV, "notinit", ICI); break; case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGE: case ICmpInst::ICMP_SGE: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_SGT: break; // no change. } ICI->replaceAllUsesWith(LV); ICI->eraseFromParent(); } LI->eraseFromParent(); } // If the initialization boolean was used, insert it, otherwise delete it. 
if (!InitBoolUsed) { while (!InitBool->use_empty()) // Delete initializations cast(InitBool->user_back())->eraseFromParent(); delete InitBool; } else GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool); // Now the GV is dead, nuke it and the malloc.. GV->eraseFromParent(); CI->eraseFromParent(); // To further other optimizations, loop over all users of NewGV and try to // constant prop them. This will promote GEP instructions with constant // indices into GEP constant-exprs, which will allow global-opt to hack on it. ConstantPropUsersOf(NewGV, DL, TLI); if (RepValue != NewGV) ConstantPropUsersOf(RepValue, DL, TLI); return NewGV; } /// Scan the use-list of V checking to make sure that there are no complex uses /// of V. We permit simple things like dereferencing the pointer, but not /// storing through the address, unless it is to the specified global. static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V, const GlobalVariable *GV, SmallPtrSetImpl &PHIs) { for (const User *U : V->users()) { const Instruction *Inst = cast(U); if (isa(Inst) || isa(Inst)) { continue; // Fine, ignore. } if (const StoreInst *SI = dyn_cast(Inst)) { if (SI->getOperand(0) == V && SI->getOperand(1) != GV) return false; // Storing the pointer itself... bad. continue; // Otherwise, storing through it, or storing into GV... fine. } // Must index into the array and into the struct. if (isa(Inst) && Inst->getNumOperands() >= 3) { if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs)) return false; continue; } if (const PHINode *PN = dyn_cast(Inst)) { // PHIs are ok if all uses are ok. Don't infinitely recurse through PHI // cycles. if (PHIs.insert(PN).second) if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs)) return false; continue; } if (const BitCastInst *BCI = dyn_cast(Inst)) { if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs)) return false; continue; } return false; } return true; } /// The Alloc pointer is stored into GV somewhere. Transform all uses of the /// allocation into loads from the global and uses of the resultant pointer. /// Further, delete the store into GV. This assumes that these value pass the /// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate. static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc, GlobalVariable *GV) { while (!Alloc->use_empty()) { Instruction *U = cast(*Alloc->user_begin()); Instruction *InsertPt = U; if (StoreInst *SI = dyn_cast(U)) { // If this is the store of the allocation into the global, remove it. if (SI->getOperand(1) == GV) { SI->eraseFromParent(); continue; } } else if (PHINode *PN = dyn_cast(U)) { // Insert the load in the corresponding predecessor, not right before the // PHI. InsertPt = PN->getIncomingBlock(*Alloc->use_begin())->getTerminator(); } else if (isa(U)) { // Must be bitcast between the malloc and store to initialize the global. ReplaceUsesOfMallocWithGlobal(U, GV); U->eraseFromParent(); continue; } else if (GetElementPtrInst *GEPI = dyn_cast(U)) { // If this is a "GEP bitcast" and the user is a store to the global, then // just process it as a bitcast. if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse()) if (StoreInst *SI = dyn_cast(GEPI->user_back())) if (SI->getOperand(1) == GV) { // Must be bitcast GEP between the malloc and store to initialize // the global. ReplaceUsesOfMallocWithGlobal(GEPI, GV); GEPI->eraseFromParent(); continue; } } // Insert a load from the global, and use it instead of the malloc. 
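    // E.g. (illustrative): a user such as
    //   %f = getelementptr %T, %T* %malloc, i64 0, i32 1
    // becomes
    //   %G.val = load %T*, %T** @G
    //   %f = getelementptr %T, %T* %G.val, i64 0, i32 1
    // with the load inserted at the user, or in the incoming block when the
    // user is a PHI (see InsertPt above).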
Value *NL = new LoadInst(GV, GV->getName()+".val", InsertPt); U->replaceUsesOfWith(Alloc, NL); } } /// Verify that all uses of V (a load, or a phi of a load) are simple enough to /// perform heap SRA on. This permits GEP's that index through the array and /// struct field, icmps of null, and PHIs. static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V, SmallPtrSetImpl &LoadUsingPHIs, SmallPtrSetImpl &LoadUsingPHIsPerLoad) { // We permit two users of the load: setcc comparing against the null // pointer, and a getelementptr of a specific form. for (const User *U : V->users()) { const Instruction *UI = cast(U); // Comparison against null is ok. if (const ICmpInst *ICI = dyn_cast(UI)) { if (!isa(ICI->getOperand(1))) return false; continue; } // getelementptr is also ok, but only a simple form. if (const GetElementPtrInst *GEPI = dyn_cast(UI)) { // Must index into the array and into the struct. if (GEPI->getNumOperands() < 3) return false; // Otherwise the GEP is ok. continue; } if (const PHINode *PN = dyn_cast(UI)) { if (!LoadUsingPHIsPerLoad.insert(PN).second) // This means some phi nodes are dependent on each other. // Avoid infinite looping! return false; if (!LoadUsingPHIs.insert(PN).second) // If we have already analyzed this PHI, then it is safe. continue; // Make sure all uses of the PHI are simple enough to transform. if (!LoadUsesSimpleEnoughForHeapSRA(PN, LoadUsingPHIs, LoadUsingPHIsPerLoad)) return false; continue; } // Otherwise we don't know what this is, not ok. return false; } return true; } /// If all users of values loaded from GV are simple enough to perform HeapSRA, /// return true. static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV, Instruction *StoredVal) { SmallPtrSet LoadUsingPHIs; SmallPtrSet LoadUsingPHIsPerLoad; for (const User *U : GV->users()) if (const LoadInst *LI = dyn_cast(U)) { if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs, LoadUsingPHIsPerLoad)) return false; LoadUsingPHIsPerLoad.clear(); } // If we reach here, we know that all uses of the loads and transitive uses // (through PHI nodes) are simple enough to transform. However, we don't know // that all inputs the to the PHI nodes are in the same equivalence sets. // Check to verify that all operands of the PHIs are either PHIS that can be // transformed, loads from GV, or MI itself. for (const PHINode *PN : LoadUsingPHIs) { for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) { Value *InVal = PN->getIncomingValue(op); // PHI of the stored value itself is ok. if (InVal == StoredVal) continue; if (const PHINode *InPN = dyn_cast(InVal)) { // One of the PHIs in our set is (optimistically) ok. if (LoadUsingPHIs.count(InPN)) continue; return false; } // Load from GV is ok. if (const LoadInst *LI = dyn_cast(InVal)) if (LI->getOperand(0) == GV) continue; // UNDEF? NULL? // Anything else is rejected. return false; } } return true; } static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, DenseMap > &InsertedScalarizedValues, std::vector > &PHIsToRewrite) { std::vector &FieldVals = InsertedScalarizedValues[V]; if (FieldNo >= FieldVals.size()) FieldVals.resize(FieldNo+1); // If we already have this value, just reuse the previously scalarized // version. if (Value *FieldVal = FieldVals[FieldNo]) return FieldVal; // Depending on what instruction this is, we have several cases. Value *Result; if (LoadInst *LI = dyn_cast(V)) { // This is a scalarized version of the load from the global. Just create // a new Load of the scalarized global. 
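    // Illustratively, for FieldNo == 1 a load such as
    //   %v = load %pair*, %pair** @G
    // gets a per-field twin
    //   %v.f1 = load i32*, i32** @G.f1
    // where @G.f1 is the field global created by the heap-SRoA transformation
    // below.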
Result = new LoadInst(GetHeapSROAValue(LI->getOperand(0), FieldNo, InsertedScalarizedValues, PHIsToRewrite), LI->getName()+".f"+Twine(FieldNo), LI); } else { PHINode *PN = cast(V); // PN's type is pointer to struct. Make a new PHI of pointer to struct // field. PointerType *PTy = cast(PN->getType()); StructType *ST = cast(PTy->getElementType()); unsigned AS = PTy->getAddressSpace(); PHINode *NewPN = PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS), PN->getNumIncomingValues(), PN->getName()+".f"+Twine(FieldNo), PN); Result = NewPN; PHIsToRewrite.push_back(std::make_pair(PN, FieldNo)); } return FieldVals[FieldNo] = Result; } /// Given a load instruction and a value derived from the load, rewrite the /// derived value to use the HeapSRoA'd load. static void RewriteHeapSROALoadUser(Instruction *LoadUser, DenseMap > &InsertedScalarizedValues, std::vector > &PHIsToRewrite) { // If this is a comparison against null, handle it. if (ICmpInst *SCI = dyn_cast(LoadUser)) { assert(isa(SCI->getOperand(1))); // If we have a setcc of the loaded pointer, we can use a setcc of any // field. Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0, InsertedScalarizedValues, PHIsToRewrite); Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr, Constant::getNullValue(NPtr->getType()), SCI->getName()); SCI->replaceAllUsesWith(New); SCI->eraseFromParent(); return; } // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...' if (GetElementPtrInst *GEPI = dyn_cast(LoadUser)) { assert(GEPI->getNumOperands() >= 3 && isa(GEPI->getOperand(2)) && "Unexpected GEPI!"); // Load the pointer for this field. unsigned FieldNo = cast(GEPI->getOperand(2))->getZExtValue(); Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo, InsertedScalarizedValues, PHIsToRewrite); // Create the new GEP idx vector. SmallVector GEPIdx; GEPIdx.push_back(GEPI->getOperand(1)); GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end()); Value *NGEPI = GetElementPtrInst::Create(GEPI->getResultElementType(), NewPtr, GEPIdx, GEPI->getName(), GEPI); GEPI->replaceAllUsesWith(NGEPI); GEPI->eraseFromParent(); return; } // Recursively transform the users of PHI nodes. This will lazily create the // PHIs that are needed for individual elements. Keep track of what PHIs we // see in InsertedScalarizedValues so that we don't get infinite loops (very // antisocial). If the PHI is already in InsertedScalarizedValues, it has // already been seen first by another load, so its uses have already been // processed. PHINode *PN = cast(LoadUser); if (!InsertedScalarizedValues.insert(std::make_pair(PN, std::vector())).second) return; // If this is the first time we've seen this PHI, recursively process all // users. for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) { Instruction *User = cast(*UI++); RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite); } } /// We are performing Heap SRoA on a global. Ptr is a value loaded from the /// global. Eliminate all uses of Ptr, making them use FieldGlobals instead. /// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA. static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, DenseMap > &InsertedScalarizedValues, std::vector > &PHIsToRewrite) { for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) { Instruction *User = cast(*UI++); RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite); } if (Load->use_empty()) { Load->eraseFromParent(); InsertedScalarizedValues.erase(Load); } } /// CI is an allocation of an array of structures. 
Break it up into multiple /// allocations of arrays of the fields. static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, Value *NElems, const DataLayout &DL, const TargetLibraryInfo *TLI) { DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n'); Type *MAT = getMallocAllocatedType(CI, TLI); StructType *STy = cast(MAT); // There is guaranteed to be at least one use of the malloc (storing // it into GV). If there are other uses, change them to be uses of // the global to simplify later code. This also deletes the store // into GV. ReplaceUsesOfMallocWithGlobal(CI, GV); // Okay, at this point, there are no users of the malloc. Insert N // new mallocs at the same place as CI, and N globals. std::vector FieldGlobals; std::vector FieldMallocs; unsigned AS = GV->getType()->getPointerAddressSpace(); for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){ Type *FieldTy = STy->getElementType(FieldNo); PointerType *PFieldTy = PointerType::get(FieldTy, AS); GlobalVariable *NGV = new GlobalVariable( *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage, Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo), nullptr, GV->getThreadLocalMode()); FieldGlobals.push_back(NGV); unsigned TypeSize = DL.getTypeAllocSize(FieldTy); if (StructType *ST = dyn_cast(FieldTy)) TypeSize = DL.getStructLayout(ST)->getSizeInBytes(); Type *IntPtrTy = DL.getIntPtrType(CI->getType()); Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy, ConstantInt::get(IntPtrTy, TypeSize), NElems, nullptr, CI->getName() + ".f" + Twine(FieldNo)); FieldMallocs.push_back(NMI); new StoreInst(NMI, NGV, CI); } // The tricky aspect of this transformation is handling the case when malloc // fails. In the original code, malloc failing would set the result pointer // of malloc to null. In this case, some mallocs could succeed and others // could fail. As such, we emit code that looks like this: // F0 = malloc(field0) // F1 = malloc(field1) // F2 = malloc(field2) // if (F0 == 0 || F1 == 0 || F2 == 0) { // if (F0) { free(F0); F0 = 0; } // if (F1) { free(F1); F1 = 0; } // if (F2) { free(F2); F2 = 0; } // } // The malloc can also fail if its argument is too large. Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0); Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0), ConstantZero, "isneg"); for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) { Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i], Constant::getNullValue(FieldMallocs[i]->getType()), "isnull"); RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI); } // Split the basic block at the old malloc. BasicBlock *OrigBB = CI->getParent(); BasicBlock *ContBB = OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont"); // Create the block to check the first condition. Put all these blocks at the // end of the function as they are unlikely to be executed. BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(), "malloc_ret_null", OrigBB->getParent()); // Remove the uncond branch from OrigBB to ContBB, turning it into a cond // branch on RunningOr. OrigBB->getTerminator()->eraseFromParent(); BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB); // Within the NullPtrBlock, we need to emit a comparison and branch for each // pointer, because some may be null while others are not. 
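  // The control flow stitched together by the loop below looks roughly like
  // this (block names as created here, field globals illustrative):
  //   malloc_ret_null: %v0 = load @G.f0; br (%v0 != null), free_it, next
  //   free_it:         call free(%v0); store null, @G.f0; br next
  //   next:            ...same check for @G.f1, @G.f2...; br malloc_cont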
for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) { Value *GVVal = new LoadInst(FieldGlobals[i], "tmp", NullPtrBlock); Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal, Constant::getNullValue(GVVal->getType())); BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it", OrigBB->getParent()); BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next", OrigBB->getParent()); Instruction *BI = BranchInst::Create(FreeBlock, NextBlock, Cmp, NullPtrBlock); // Fill in FreeBlock. CallInst::CreateFree(GVVal, BI); new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i], FreeBlock); BranchInst::Create(NextBlock, FreeBlock); NullPtrBlock = NextBlock; } BranchInst::Create(ContBB, NullPtrBlock); // CI is no longer needed, remove it. CI->eraseFromParent(); /// As we process loads, if we can't immediately update all uses of the load, /// keep track of what scalarized loads are inserted for a given load. DenseMap > InsertedScalarizedValues; InsertedScalarizedValues[GV] = FieldGlobals; std::vector > PHIsToRewrite; // Okay, the malloc site is completely handled. All of the uses of GV are now // loads, and all uses of those loads are simple. Rewrite them to use loads // of the per-field globals instead. for (auto UI = GV->user_begin(), E = GV->user_end(); UI != E;) { Instruction *User = cast(*UI++); if (LoadInst *LI = dyn_cast(User)) { RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite); continue; } // Must be a store of null. StoreInst *SI = cast(User); assert(isa(SI->getOperand(0)) && "Unexpected heap-sra user!"); // Insert a store of null into each global. for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) { Type *ValTy = cast(FieldGlobals[i])->getValueType(); Constant *Null = Constant::getNullValue(ValTy); new StoreInst(Null, FieldGlobals[i], SI); } // Erase the original store. SI->eraseFromParent(); } // While we have PHIs that are interesting to rewrite, do it. while (!PHIsToRewrite.empty()) { PHINode *PN = PHIsToRewrite.back().first; unsigned FieldNo = PHIsToRewrite.back().second; PHIsToRewrite.pop_back(); PHINode *FieldPN = cast(InsertedScalarizedValues[PN][FieldNo]); assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi"); // Add all the incoming values. This can materialize more phis. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *InVal = PN->getIncomingValue(i); InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues, PHIsToRewrite); FieldPN->addIncoming(InVal, PN->getIncomingBlock(i)); } } // Drop all inter-phi links and any loads that made it this far. for (DenseMap >::iterator I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end(); I != E; ++I) { if (PHINode *PN = dyn_cast(I->first)) PN->dropAllReferences(); else if (LoadInst *LI = dyn_cast(I->first)) LI->dropAllReferences(); } // Delete all the phis and loads now that inter-references are dead. for (DenseMap >::iterator I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end(); I != E; ++I) { if (PHINode *PN = dyn_cast(I->first)) PN->eraseFromParent(); else if (LoadInst *LI = dyn_cast(I->first)) LI->eraseFromParent(); } // The old global is now dead, remove it. GV->eraseFromParent(); ++NumHeapSRA; return cast(FieldGlobals[0]); } /// This function is called when we see a pointer global variable with a single /// value stored it that is a malloc or cast of malloc. 
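/// A typical pattern this matches (illustrative):
///   @G = internal global %T* null
///   ...
///   %m = call i8* @malloc(i64 %size)
///   %t = bitcast i8* %m to %T*
///   store %T* %t, %T** @G
/// where every subsequent use of the loaded pointer would trap if it were
/// null.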
static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
                                               Type *AllocTy,
                                               AtomicOrdering Ordering,
                                               const DataLayout &DL,
                                               TargetLibraryInfo *TLI) {
  // If this is a malloc of an abstract type, don't touch it.
  if (!AllocTy->isSized())
    return false;

  // We can't optimize this global unless all uses of it are *known* to be
  // of the malloc value, not of the null initializer value (consider a use
  // that compares the global's value against zero to see if the malloc has
  // been reached).  To do this, we check to see if all uses of the global
  // would trap if the global were null: this proves that they must all
  // happen after the malloc.
  if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
    return false;

  // We can't optimize this if the malloc itself is used in a complex way,
  // for example, being stored into multiple globals.  This allows the
  // malloc to be stored into the specified global, loaded, icmp'd, and
  // GEP'd.  These are all things we could transform to using the global
  // for.
  SmallPtrSet<const PHINode*, 8> PHIs;
  if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs))
    return false;

  // If we have a global that is only initialized with a fixed size malloc,
  // transform the program to use global memory instead of malloc'd memory.
  // This eliminates dynamic allocation, avoids an indirection accessing the
  // data, and exposes the resultant global to further GlobalOpt.
  // We cannot optimize the malloc if we cannot determine malloc array size.
  Value *NElems = getMallocArraySize(CI, DL, TLI, true);
  if (!NElems)
    return false;

  if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
    // Restrict this transformation to only working on small allocations
    // (2048 bytes currently), as we don't want to introduce a 16M global or
    // something.
    if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
      OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
      return true;
    }

  // If the allocation is an array of structures, consider transforming this
  // into multiple malloc'd arrays, one for each field.  This is basically
  // SRoA for malloc'd memory.

  if (Ordering != NotAtomic)
    return false;

  // If this is an allocation of a fixed size array of structs, analyze as a
  // variable size array.  malloc [100 x struct],1 -> malloc struct, 100
  if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
    if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
      AllocTy = AT->getElementType();

  StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
  if (!AllocSTy)
    return false;

  // If the structure has an unreasonable number of fields, leave it
  // alone.
  if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
      AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) {

    // If this is a fixed size array, transform the Malloc to be an alloc of
    // structs.
malloc [100 x struct],1 -> malloc struct, 100 if (ArrayType *AT = dyn_cast(getMallocAllocatedType(CI, TLI))) { Type *IntPtrTy = DL.getIntPtrType(CI->getType()); unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes(); Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements, nullptr, CI->getName()); Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); CI->replaceAllUsesWith(Cast); CI->eraseFromParent(); if (BitCastInst *BCI = dyn_cast(Malloc)) CI = cast(BCI->getOperand(0)); else CI = cast(Malloc); } PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL, TLI); return true; } return false; } // Try to optimize globals based on the knowledge that only one value (besides // its initializer) is ever stored to the global. static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, AtomicOrdering Ordering, const DataLayout &DL, TargetLibraryInfo *TLI) { // Ignore no-op GEPs and bitcasts. StoredOnceVal = StoredOnceVal->stripPointerCasts(); // If we are dealing with a pointer global that is initialized to null and // only has one (non-null) value stored into it, then we can optimize any // users of the loaded value (often calls and loads) that would trap if the // value was null. if (GV->getInitializer()->getType()->isPointerTy() && GV->getInitializer()->isNullValue()) { if (Constant *SOVC = dyn_cast(StoredOnceVal)) { if (GV->getInitializer()->getType() != SOVC->getType()) SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType()); // Optimize away any trapping uses of the loaded value. if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, TLI)) return true; } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) { Type *MallocType = getMallocAllocatedType(CI, TLI); if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, DL, TLI)) return true; } } return false; } /// At this point, we have learned that the only two values ever stored into GV /// are its initializer and OtherVal. See if we can shrink the global into a /// boolean and select between the two values whenever it is used. This exposes /// the values to other scalar optimizations. static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { Type *GVElType = GV->getValueType(); // If GVElType is already i1, it is already shrunk. If the type of the GV is // an FP value, pointer or vector, don't do this optimization because a select // between them is very expensive and unlikely to lead to later // simplification. In these cases, we typically end up with "cond ? v1 : v2" // where v1 and v2 both require constant pool loads, a big loss. if (GVElType == Type::getInt1Ty(GV->getContext()) || GVElType->isFloatingPointTy() || GVElType->isPointerTy() || GVElType->isVectorTy()) return false; // Walk the use list of the global seeing if all the uses are load or store. // If there is anything else, bail out. for (User *U : GV->users()) if (!isa(U) && !isa(U)) return false; DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n"); // Create the new global, initializing it to false. 
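  // (Illustrative net effect, assuming @G = internal global i32 0 and the only
  // other value ever stored is 42: we create @G.b = internal global i1 false,
  // rewrite 'store i32 42, i32* @G' to 'store i1 true, i1* @G.b', and rewrite
  // each load into 'select i1 %b, i32 42, i32 0', or into a zext of the loaded
  // bit when the two values are literally 0 and 1.)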
GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()), false, GlobalValue::InternalLinkage, ConstantInt::getFalse(GV->getContext()), GV->getName()+".b", GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV); Constant *InitVal = GV->getInitializer(); assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) && "No reason to shrink to bool!"); // If initialized to zero and storing one into the global, we can use a cast // instead of a select to synthesize the desired value. bool IsOneZero = false; if (ConstantInt *CI = dyn_cast(OtherVal)) IsOneZero = InitVal->isNullValue() && CI->isOne(); while (!GV->use_empty()) { Instruction *UI = cast(GV->user_back()); if (StoreInst *SI = dyn_cast(UI)) { // Change the store into a boolean store. bool StoringOther = SI->getOperand(0) == OtherVal; // Only do this if we weren't storing a loaded value. Value *StoreVal; if (StoringOther || SI->getOperand(0) == InitVal) { StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()), StoringOther); } else { // Otherwise, we are storing a previously loaded copy. To do this, // change the copy from copying the original value to just copying the // bool. Instruction *StoredVal = cast(SI->getOperand(0)); // If we've already replaced the input, StoredVal will be a cast or // select instruction. If not, it will be a load of the original // global. if (LoadInst *LI = dyn_cast(StoredVal)) { assert(LI->getOperand(0) == GV && "Not a copy!"); // Insert a new load, to preserve the saved value. StoreVal = new LoadInst(NewGV, LI->getName()+".b", false, 0, LI->getOrdering(), LI->getSynchScope(), LI); } else { assert((isa(StoredVal) || isa(StoredVal)) && "This is not a form that we understand!"); StoreVal = StoredVal->getOperand(0); assert(isa(StoreVal) && "Not a load of NewGV!"); } } new StoreInst(StoreVal, NewGV, false, 0, SI->getOrdering(), SI->getSynchScope(), SI); } else { // Change the load into a load of bool then a select. LoadInst *LI = cast(UI); LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", false, 0, LI->getOrdering(), LI->getSynchScope(), LI); Value *NSI; if (IsOneZero) NSI = new ZExtInst(NLI, LI->getType(), "", LI); else NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI); NSI->takeName(LI); LI->replaceAllUsesWith(NSI); } UI->eraseFromParent(); } // Retain the name of the old global variable. People who are debugging their // programs may expect these variables to be named the same. NewGV->takeName(GV); GV->eraseFromParent(); return true; } bool GlobalOpt::deleteIfDead(GlobalValue &GV) { GV.removeDeadConstantUsers(); if (!GV.isDiscardableIfUnused()) return false; if (const Comdat *C = GV.getComdat()) if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C)) return false; bool Dead; if (auto *F = dyn_cast(&GV)) Dead = F->isDefTriviallyDead(); else Dead = GV.use_empty(); if (!Dead) return false; DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n"); GV.eraseFromParent(); ++NumDeleted; return true; } /// Analyze the specified global variable and optimize it if possible. If we /// make a change, return true. bool GlobalOpt::processGlobal(GlobalValue &GV) { // Do more involved optimizations if the global is internal. 
if (!GV.hasLocalLinkage()) return false; GlobalStatus GS; if (GlobalStatus::analyzeGlobal(&GV, GS)) return false; bool Changed = false; if (!GS.IsCompared && !GV.hasUnnamedAddr()) { GV.setUnnamedAddr(true); NumUnnamed++; Changed = true; } auto *GVar = dyn_cast(&GV); if (!GVar) return Changed; if (GVar->isConstant() || !GVar->hasInitializer()) return Changed; return processInternalGlobal(GVar, GS) || Changed; } bool GlobalOpt::isPointerValueDeadOnEntryToFunction(const Function *F, GlobalValue *GV) { // Find all uses of GV. We expect them all to be in F, and if we can't // identify any of the uses we bail out. // // On each of these uses, identify if the memory that GV points to is // used/required/live at the start of the function. If it is not, for example // if the first thing the function does is store to the GV, the GV can // possibly be demoted. // // We don't do an exhaustive search for memory operations - simply look // through bitcasts as they're quite common and benign. const DataLayout &DL = GV->getParent()->getDataLayout(); SmallVector Loads; SmallVector Stores; for (auto *U : GV->users()) { if (Operator::getOpcode(U) == Instruction::BitCast) { for (auto *UU : U->users()) { if (auto *LI = dyn_cast(UU)) Loads.push_back(LI); else if (auto *SI = dyn_cast(UU)) Stores.push_back(SI); else return false; } continue; } Instruction *I = dyn_cast(U); if (!I) return false; assert(I->getParent()->getParent() == F); if (auto *LI = dyn_cast(I)) Loads.push_back(LI); else if (auto *SI = dyn_cast(I)) Stores.push_back(SI); else return false; } // We have identified all uses of GV into loads and stores. Now check if all // of them are known not to depend on the value of the global at the function // entry point. We do this by ensuring that every load is dominated by at // least one store. auto &DT = getAnalysis(*const_cast(F)) .getDomTree(); // The below check is quadratic. Check we're not going to do too many tests. // FIXME: Even though this will always have worst-case quadratic time, we // could put effort into minimizing the average time by putting stores that // have been shown to dominate at least one load at the beginning of the // Stores array, making subsequent dominance checks more likely to succeed // early. // // The threshold here is fairly large because global->local demotion is a // very powerful optimization should it fire. const unsigned Threshold = 100; if (Loads.size() * Stores.size() > Threshold) return false; for (auto *L : Loads) { auto *LTy = L->getType(); if (!std::any_of(Stores.begin(), Stores.end(), [&](StoreInst *S) { auto *STy = S->getValueOperand()->getType(); // The load is only dominated by the store if DomTree says so // and the number of bits loaded in L is less than or equal to // the number of bits stored in S. return DT.dominates(S, L) && DL.getTypeStoreSize(LTy) <= DL.getTypeStoreSize(STy); })) return false; } // All loads have known dependences inside F, so the global can be localized. return true; } /// C may have non-instruction users. Can all of those users be turned into /// instructions? static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) { // We don't do this exhaustively. The most common pattern that we really need // to care about is a constant GEP or constant bitcast - so just looking // through one single ConstantExpr. // // The set of constants that this function returns true for must be able to be // handled by makeAllConstantUsesInstructions. 
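  // Example of what we accept (illustrative): a user such as
  //   store i32 0, i32* getelementptr ({ i32, i32 }, { i32, i32 }* @G, i32 0, i32 1)
  // is fine, because the constantexpr GEP can later be rematerialized as a
  // getelementptr instruction immediately before the store.  A constantexpr
  // whose own user is another constant is rejected; we don't recurse that far.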
for (auto *U : C->users()) { if (isa(U)) continue; if (!isa(U)) // Non instruction, non-constantexpr user; cannot convert this. return false; for (auto *UU : U->users()) if (!isa(UU)) // A constantexpr used by another constant. We don't try and recurse any // further but just bail out at this point. return false; } return true; } /// C may have non-instruction users, and /// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the /// non-instruction users to instructions. static void makeAllConstantUsesInstructions(Constant *C) { SmallVector Users; for (auto *U : C->users()) { if (isa(U)) Users.push_back(cast(U)); else // We should never get here; allNonInstructionUsersCanBeMadeInstructions // should not have returned true for C. assert( isa(U) && "Can't transform non-constantexpr non-instruction to instruction!"); } SmallVector UUsers; for (auto *U : Users) { UUsers.clear(); for (auto *UU : U->users()) UUsers.push_back(UU); for (auto *UU : UUsers) { Instruction *UI = cast(UU); Instruction *NewU = U->getAsInstruction(); NewU->insertBefore(UI); UI->replaceUsesOfWith(U, NewU); } U->dropAllReferences(); } } /// Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. bool GlobalOpt::processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS) { auto &DL = GV->getParent()->getDataLayout(); // If this is a first class global and has only one accessing function and // this function is non-recursive, we replace the global with a local alloca // in this function. // // NOTE: It doesn't make sense to promote non-single-value types since we // are just replacing static memory to stack memory. // // If the global is in different address space, don't bring it to stack. if (!GS.HasMultipleAccessingFunctions && GS.AccessingFunction && GV->getValueType()->isSingleValueType() && GV->getType()->getAddressSpace() == 0 && !GV->isExternallyInitialized() && allNonInstructionUsersCanBeMadeInstructions(GV) && GS.AccessingFunction->doesNotRecurse() && isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV) ) { DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n"); Instruction &FirstI = const_cast(*GS.AccessingFunction ->getEntryBlock().begin()); Type *ElemTy = GV->getValueType(); // FIXME: Pass Global's alignment when globals have alignment AllocaInst *Alloca = new AllocaInst(ElemTy, nullptr, GV->getName(), &FirstI); if (!isa(GV->getInitializer())) new StoreInst(GV->getInitializer(), Alloca, &FirstI); makeAllConstantUsesInstructions(GV); GV->replaceAllUsesWith(Alloca); GV->eraseFromParent(); ++NumLocalized; return true; } // If the global is never loaded (but may be stored to), it is dead. // Delete it now. if (!GS.IsLoaded) { DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n"); bool Changed; if (isLeakCheckerRoot(GV)) { // Delete any constant stores to the global. Changed = CleanupPointerRootUsers(GV, TLI); } else { // Delete any stores we can find to the global. We may not be able to // make it completely dead though. Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); } // If the global is dead now, delete it. if (GV->use_empty()) { GV->eraseFromParent(); ++NumDeleted; Changed = true; } return Changed; } else if (GS.StoredType <= GlobalStatus::InitializerStored) { DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n"); GV->setConstant(true); // Clean up any obviously simplifiable users now. CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); // If the global is dead now, just nuke it. 
if (GV->use_empty()) { DEBUG(dbgs() << " *** Marking constant allowed us to simplify " << "all users and delete global!\n"); GV->eraseFromParent(); ++NumDeleted; } ++NumMarked; return true; } else if (!GV->getInitializer()->getType()->isSingleValueType()) { const DataLayout &DL = GV->getParent()->getDataLayout(); if (SRAGlobal(GV, DL)) return true; } else if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) { // If the initial value for the global was an undef value, and if only // one other value was stored into it, we can just change the // initializer to be the stored value, then delete all stores to the // global. This allows us to mark it constant. if (Constant *SOVConstant = dyn_cast(GS.StoredOnceValue)) if (isa(GV->getInitializer())) { // Change the initial value here. GV->setInitializer(SOVConstant); // Clean up any obviously simplifiable users now. CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); if (GV->use_empty()) { DEBUG(dbgs() << " *** Substituting initializer allowed us to " << "simplify all users and delete global!\n"); GV->eraseFromParent(); ++NumDeleted; } ++NumSubstitute; return true; } // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, TLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a // boolean. if (Constant *SOVConstant = dyn_cast(GS.StoredOnceValue)) { if (GS.Ordering == NotAtomic) { if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { ++NumShrunkToBool; return true; } } } } return false; } /// Walk all of the direct calls of the specified function, changing them to /// FastCC. static void ChangeCalleesToFastCall(Function *F) { for (User *U : F->users()) { if (isa(U)) continue; CallSite CS(cast(U)); CS.setCallingConv(CallingConv::Fast); } } static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) { for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) { unsigned Index = Attrs.getSlotIndex(i); if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest)) continue; // There can be only one. return Attrs.removeAttribute(C, Index, Attribute::Nest); } return Attrs; } static void RemoveNestAttribute(Function *F) { F->setAttributes(StripNest(F->getContext(), F->getAttributes())); for (User *U : F->users()) { if (isa(U)) continue; CallSite CS(cast(U)); CS.setAttributes(StripNest(F->getContext(), CS.getAttributes())); } } /// Return true if this is a calling convention that we'd like to change. The /// idea here is that we don't want to mess with the convention if the user /// explicitly requested something with performance implications like coldcc, /// GHC, or anyregcc. static bool isProfitableToMakeFastCC(Function *F) { CallingConv::ID CC = F->getCallingConv(); // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc? return CC == CallingConv::C || CC == CallingConv::X86_ThisCall; } bool GlobalOpt::OptimizeFunctions(Module &M) { bool Changed = false; // Optimize functions. for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) { Function *F = &*FI++; // Functions without names cannot be referenced outside this module. 
if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage()) F->setLinkage(GlobalValue::InternalLinkage); if (deleteIfDead(*F)) { Changed = true; continue; } Changed |= processGlobal(*F); if (!F->hasLocalLinkage()) continue; if (isProfitableToMakeFastCC(F) && !F->isVarArg() && !F->hasAddressTaken()) { // If this function has a calling convention worth changing, is not a // varargs function, and is only called directly, promote it to use the // Fast calling convention. F->setCallingConv(CallingConv::Fast); ChangeCalleesToFastCall(F); ++NumFastCallFns; Changed = true; } if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && !F->hasAddressTaken()) { // The function is not used by a trampoline intrinsic, so it is safe // to remove the 'nest' attribute. RemoveNestAttribute(F); ++NumNestRemoved; Changed = true; } } return Changed; } bool GlobalOpt::OptimizeGlobalVars(Module &M) { bool Changed = false; for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); GVI != E; ) { GlobalVariable *GV = &*GVI++; // Global variables without names cannot be referenced outside this module. if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage()) GV->setLinkage(GlobalValue::InternalLinkage); // Simplify the initializer. if (GV->hasInitializer()) if (ConstantExpr *CE = dyn_cast(GV->getInitializer())) { auto &DL = M.getDataLayout(); Constant *New = ConstantFoldConstantExpression(CE, DL, TLI); if (New && New != CE) GV->setInitializer(New); } if (deleteIfDead(*GV)) { Changed = true; continue; } Changed |= processGlobal(*GV); } return Changed; } static inline bool isSimpleEnoughValueToCommit(Constant *C, SmallPtrSetImpl &SimpleConstants, const DataLayout &DL); /// Return true if the specified constant can be handled by the code generator. /// We don't want to generate something like: /// void *X = &X/42; /// because the code generator doesn't have a relocation that can handle that. /// /// This function should be called if C was not found (but just got inserted) /// in SimpleConstants to avoid having to rescan the same constants all the /// time. static bool isSimpleEnoughValueToCommitHelper(Constant *C, SmallPtrSetImpl &SimpleConstants, const DataLayout &DL) { // Simple global addresses are supported, do not allow dllimport or // thread-local globals. if (auto *GV = dyn_cast(C)) return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal(); // Simple integer, undef, constant aggregate zero, etc are all supported. if (C->getNumOperands() == 0 || isa(C)) return true; // Aggregate values are safe if all their elements are. if (isa(C) || isa(C) || isa(C)) { for (Value *Op : C->operands()) if (!isSimpleEnoughValueToCommit(cast(Op), SimpleConstants, DL)) return false; return true; } // We don't know exactly what relocations are allowed in constant expressions, // so we allow &global+constantoffset, which is safe and uniformly supported // across targets. ConstantExpr *CE = cast(C); switch (CE->getOpcode()) { case Instruction::BitCast: // Bitcast is fine if the casted value is fine. return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); case Instruction::IntToPtr: case Instruction::PtrToInt: // int <=> ptr is fine if the int type is the same size as the // pointer type. if (DL.getTypeSizeInBits(CE->getType()) != DL.getTypeSizeInBits(CE->getOperand(0)->getType())) return false; return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); // GEP is fine if it is simple + constant offset. 
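  // (Illustrative: a constant like 'getelementptr (i8, i8* @G, i64 16)' is
  // fine to commit, while something like the '&X/42' example above never gets
  // here because udiv is not one of the accepted opcodes and falls through to
  // the 'return false' below.)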
case Instruction::GetElementPtr: for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i) if (!isa(CE->getOperand(i))) return false; return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); case Instruction::Add: // We allow simple+cst. if (!isa(CE->getOperand(1))) return false; return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL); } return false; } static inline bool isSimpleEnoughValueToCommit(Constant *C, SmallPtrSetImpl &SimpleConstants, const DataLayout &DL) { // If we already checked this constant, we win. if (!SimpleConstants.insert(C).second) return true; // Check the constant. return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL); } /// Return true if this constant is simple enough for us to understand. In /// particular, if it is a cast to anything other than from one pointer type to /// another pointer type, we punt. We basically just support direct accesses to /// globals and GEP's of globals. This should be kept up to date with /// CommitValueTo. static bool isSimpleEnoughPointerToCommit(Constant *C) { // Conservatively, avoid aggregate types. This is because we don't // want to worry about them partially overlapping other stores. if (!cast(C->getType())->getElementType()->isSingleValueType()) return false; if (GlobalVariable *GV = dyn_cast(C)) // Do not allow weak/*_odr/linkonce linkage or external globals. return GV->hasUniqueInitializer(); if (ConstantExpr *CE = dyn_cast(C)) { // Handle a constantexpr gep. if (CE->getOpcode() == Instruction::GetElementPtr && isa(CE->getOperand(0)) && cast(CE)->isInBounds()) { GlobalVariable *GV = cast(CE->getOperand(0)); // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or // external globals. if (!GV->hasUniqueInitializer()) return false; // The first index must be zero. ConstantInt *CI = dyn_cast(*std::next(CE->op_begin())); if (!CI || !CI->isZero()) return false; // The remaining indices must be compile-time known integers within the // notional bounds of the corresponding static array types. if (!CE->isGEPWithNoNotionalOverIndexing()) return false; return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE); // A constantexpr bitcast from a pointer to another pointer is a no-op, // and we know how to evaluate it by moving the bitcast from the pointer // operand to the value operand. } else if (CE->getOpcode() == Instruction::BitCast && isa(CE->getOperand(0))) { // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or // external globals. return cast(CE->getOperand(0))->hasUniqueInitializer(); } } return false; } /// Evaluate a piece of a constantexpr store into a global initializer. This /// returns 'Init' modified to reflect 'Val' stored into it. At this point, the /// GEP operands of Addr [0, OpNo) have been stepped into. static Constant *EvaluateStoreInto(Constant *Init, Constant *Val, ConstantExpr *Addr, unsigned OpNo) { // Base case of the recursion. if (OpNo == Addr->getNumOperands()) { assert(Val->getType() == Init->getType() && "Type mismatch!"); return Val; } SmallVector Elts; if (StructType *STy = dyn_cast(Init->getType())) { // Break up the constant into its elements. for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) Elts.push_back(Init->getAggregateElement(i)); // Replace the element that we are supposed to. 
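  // E.g. (illustrative) committing a store of 'i32 5' through
  //   getelementptr ({ i32, [4 x i32] }, { i32, [4 x i32] }* @G, i32 0, i32 1, i32 2)
  // recurses once per remaining index: first into field 1 of the struct, then
  // into element 2 of the array, rebuilding the initializer with only that
  // leaf replaced by 5.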
ConstantInt *CU = cast(Addr->getOperand(OpNo)); unsigned Idx = CU->getZExtValue(); assert(Idx < STy->getNumElements() && "Struct index out of range!"); Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1); // Return the modified struct. return ConstantStruct::get(STy, Elts); } ConstantInt *CI = cast(Addr->getOperand(OpNo)); SequentialType *InitTy = cast(Init->getType()); uint64_t NumElts; if (ArrayType *ATy = dyn_cast(InitTy)) NumElts = ATy->getNumElements(); else NumElts = InitTy->getVectorNumElements(); // Break up the array into elements. for (uint64_t i = 0, e = NumElts; i != e; ++i) Elts.push_back(Init->getAggregateElement(i)); assert(CI->getZExtValue() < NumElts); Elts[CI->getZExtValue()] = EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1); if (Init->getType()->isArrayTy()) return ConstantArray::get(cast(InitTy), Elts); return ConstantVector::get(Elts); } /// We have decided that Addr (which satisfies the predicate /// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen. static void CommitValueTo(Constant *Val, Constant *Addr) { if (GlobalVariable *GV = dyn_cast(Addr)) { assert(GV->hasInitializer()); GV->setInitializer(Val); return; } ConstantExpr *CE = cast(Addr); GlobalVariable *GV = cast(CE->getOperand(0)); GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2)); } namespace { /// This class evaluates LLVM IR, producing the Constant representing each SSA /// instruction. Changes to global variables are stored in a mapping that can /// be iterated over after the evaluation is complete. Once an evaluation call /// fails, the evaluation object should not be reused. class Evaluator { public: Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI) : DL(DL), TLI(TLI) { ValueStack.emplace_back(); } ~Evaluator() { for (auto &Tmp : AllocaTmps) // If there are still users of the alloca, the program is doing something // silly, e.g. storing the address of the alloca somewhere and using it // later. Since this is undefined, we'll just make it be null. if (!Tmp->use_empty()) Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType())); } /// Evaluate a call to function F, returning true if successful, false if we /// can't evaluate it. ActualArgs contains the formal arguments for the /// function. bool EvaluateFunction(Function *F, Constant *&RetVal, const SmallVectorImpl &ActualArgs); /// Evaluate all instructions in block BB, returning true if successful, false /// if we can't evaluate it. NewBB returns the next BB that control flows /// into, or null upon return. bool EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB); Constant *getVal(Value *V) { if (Constant *CV = dyn_cast(V)) return CV; Constant *R = ValueStack.back().lookup(V); assert(R && "Reference to an uncomputed value!"); return R; } void setVal(Value *V, Constant *C) { ValueStack.back()[V] = C; } const DenseMap &getMutatedMemory() const { return MutatedMemory; } const SmallPtrSetImpl &getInvariants() const { return Invariants; } private: Constant *ComputeLoadResult(Constant *P); /// As we compute SSA register values, we store their contents here. The back /// of the deque contains the current function and the stack contains the /// values in the calling frames. std::deque> ValueStack; /// This is used to detect recursion. In pathological situations we could hit /// exponential behavior, but at least there is nothing unbounded. SmallVector CallStack; /// For each store we execute, we update this map. 
Loads check this to get /// the most up-to-date value. If evaluation is successful, this state is /// committed to the process. DenseMap MutatedMemory; /// To 'execute' an alloca, we create a temporary global variable to represent /// its body. This vector is needed so we can delete the temporary globals /// when we are done. SmallVector, 32> AllocaTmps; /// These global variables have been marked invariant by the static /// constructor. SmallPtrSet Invariants; /// These are constants we have checked and know to be simple enough to live /// in a static initializer of a global. SmallPtrSet SimpleConstants; const DataLayout &DL; const TargetLibraryInfo *TLI; }; } // anonymous namespace /// Return the value that would be computed by a load from P after the stores /// reflected by 'memory' have been performed. If we can't decide, return null. Constant *Evaluator::ComputeLoadResult(Constant *P) { // If this memory location has been recently stored, use the stored value: it // is the most up-to-date. DenseMap::const_iterator I = MutatedMemory.find(P); if (I != MutatedMemory.end()) return I->second; // Access it. if (GlobalVariable *GV = dyn_cast(P)) { if (GV->hasDefinitiveInitializer()) return GV->getInitializer(); return nullptr; } // Handle a constantexpr getelementptr. if (ConstantExpr *CE = dyn_cast(P)) if (CE->getOpcode() == Instruction::GetElementPtr && isa(CE->getOperand(0))) { GlobalVariable *GV = cast(CE->getOperand(0)); if (GV->hasDefinitiveInitializer()) return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE); } return nullptr; // don't know how to evaluate. } /// Evaluate all instructions in block BB, returning true if successful, false /// if we can't evaluate it. NewBB returns the next BB that control flows into, /// or null upon return. bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB) { // This is the main evaluation loop. while (1) { Constant *InstResult = nullptr; DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n"); if (StoreInst *SI = dyn_cast(CurInst)) { if (!SI->isSimple()) { DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n"); return false; // no volatile/atomic accesses. } Constant *Ptr = getVal(SI->getOperand(1)); if (ConstantExpr *CE = dyn_cast(Ptr)) { DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr); Ptr = ConstantFoldConstantExpression(CE, DL, TLI); DEBUG(dbgs() << "; To: " << *Ptr << "\n"); } if (!isSimpleEnoughPointerToCommit(Ptr)) { // If this is too complex for us to commit, reject it. DEBUG(dbgs() << "Pointer is too complex for us to evaluate store."); return false; } Constant *Val = getVal(SI->getOperand(0)); // If this might be too difficult for the backend to handle (e.g. the addr // of one global variable divided by another) then we can't commit it. if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) { DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val << "\n"); return false; } if (ConstantExpr *CE = dyn_cast(Ptr)) { if (CE->getOpcode() == Instruction::BitCast) { DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n"); // If we're evaluating a store through a bitcast, then we need // to pull the bitcast off the pointer type and push it onto the // stored value. Ptr = CE->getOperand(0); Type *NewTy = cast(Ptr->getType())->getElementType(); // In order to push the bitcast onto the stored value, a bitcast // from NewTy to Val's type must be legal. If it's not, we can try // introspecting NewTy to find a legal conversion. 
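          // For instance (illustrative): storing an i32 through a pointer that
          // was bitcast from '{ i32, i8 }*' is handled by stepping the pointer
          // down to its first member, i.e. rewriting it as
          //   getelementptr ({ i32, i8 }, { i32, i8 }* %ptr, i32 0, i32 0)
          // and retrying, until the pointee type can be losslessly bitcast
          // to/from the stored value's type.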
while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) { // If NewTy is a struct, we can convert the pointer to the struct // into a pointer to its first member. // FIXME: This could be extended to support arrays as well. if (StructType *STy = dyn_cast(NewTy)) { NewTy = STy->getTypeAtIndex(0U); IntegerType *IdxTy = IntegerType::get(NewTy->getContext(), 32); Constant *IdxZero = ConstantInt::get(IdxTy, 0, false); Constant * const IdxList[] = {IdxZero, IdxZero}; Ptr = ConstantExpr::getGetElementPtr(nullptr, Ptr, IdxList); if (ConstantExpr *CE = dyn_cast(Ptr)) Ptr = ConstantFoldConstantExpression(CE, DL, TLI); // If we can't improve the situation by introspecting NewTy, // we have to give up. } else { DEBUG(dbgs() << "Failed to bitcast constant ptr, can not " "evaluate.\n"); return false; } } // If we found compatible types, go ahead and push the bitcast // onto the stored value. Val = ConstantExpr::getBitCast(Val, NewTy); DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n"); } } MutatedMemory[Ptr] = Val; } else if (BinaryOperator *BO = dyn_cast(CurInst)) { InstResult = ConstantExpr::get(BO->getOpcode(), getVal(BO->getOperand(0)), getVal(BO->getOperand(1))); DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult << "\n"); } else if (CmpInst *CI = dyn_cast(CurInst)) { InstResult = ConstantExpr::getCompare(CI->getPredicate(), getVal(CI->getOperand(0)), getVal(CI->getOperand(1))); DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult << "\n"); } else if (CastInst *CI = dyn_cast(CurInst)) { InstResult = ConstantExpr::getCast(CI->getOpcode(), getVal(CI->getOperand(0)), CI->getType()); DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult << "\n"); } else if (SelectInst *SI = dyn_cast(CurInst)) { InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)), getVal(SI->getOperand(1)), getVal(SI->getOperand(2))); DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult << "\n"); } else if (auto *EVI = dyn_cast(CurInst)) { InstResult = ConstantExpr::getExtractValue( getVal(EVI->getAggregateOperand()), EVI->getIndices()); DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " << *InstResult << "\n"); } else if (auto *IVI = dyn_cast(CurInst)) { InstResult = ConstantExpr::getInsertValue( getVal(IVI->getAggregateOperand()), getVal(IVI->getInsertedValueOperand()), IVI->getIndices()); DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " << *InstResult << "\n"); } else if (GetElementPtrInst *GEP = dyn_cast(CurInst)) { Constant *P = getVal(GEP->getOperand(0)); SmallVector GEPOps; for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e; ++i) GEPOps.push_back(getVal(*i)); InstResult = ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps, cast(GEP)->isInBounds()); DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult << "\n"); } else if (LoadInst *LI = dyn_cast(CurInst)) { if (!LI->isSimple()) { DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n"); return false; // no volatile/atomic accesses. } Constant *Ptr = getVal(LI->getOperand(0)); if (ConstantExpr *CE = dyn_cast(Ptr)) { Ptr = ConstantFoldConstantExpression(CE, DL, TLI); DEBUG(dbgs() << "Found a constant pointer expression, constant " "folding: " << *Ptr << "\n"); } InstResult = ComputeLoadResult(Ptr); if (!InstResult) { DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load." "\n"); return false; // Could not evaluate load. 
} DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n"); } else if (AllocaInst *AI = dyn_cast(CurInst)) { if (AI->isArrayAllocation()) { DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n"); return false; // Cannot handle array allocs. } - Type *Ty = AI->getType()->getElementType(); + Type *Ty = AI->getAllocatedType(); AllocaTmps.push_back( make_unique(Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty), AI->getName())); InstResult = AllocaTmps.back().get(); DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); } else if (isa(CurInst) || isa(CurInst)) { CallSite CS(&*CurInst); // Debug info can safely be ignored here. if (isa(CS.getInstruction())) { DEBUG(dbgs() << "Ignoring debug info.\n"); ++CurInst; continue; } // Cannot handle inline asm. if (isa(CS.getCalledValue())) { DEBUG(dbgs() << "Found inline asm, can not evaluate.\n"); return false; } if (IntrinsicInst *II = dyn_cast(CS.getInstruction())) { if (MemSetInst *MSI = dyn_cast(II)) { if (MSI->isVolatile()) { DEBUG(dbgs() << "Can not optimize a volatile memset " << "intrinsic.\n"); return false; } Constant *Ptr = getVal(MSI->getDest()); Constant *Val = getVal(MSI->getValue()); Constant *DestVal = ComputeLoadResult(getVal(Ptr)); if (Val->isNullValue() && DestVal && DestVal->isNullValue()) { // This memset is a no-op. DEBUG(dbgs() << "Ignoring no-op memset.\n"); ++CurInst; continue; } } if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end) { DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n"); ++CurInst; continue; } if (II->getIntrinsicID() == Intrinsic::invariant_start) { // We don't insert an entry into Values, as it doesn't have a // meaningful return value. if (!II->use_empty()) { DEBUG(dbgs() << "Found unused invariant_start. Can't evaluate.\n"); return false; } ConstantInt *Size = cast(II->getArgOperand(0)); Value *PtrArg = getVal(II->getArgOperand(1)); Value *Ptr = PtrArg->stripPointerCasts(); if (GlobalVariable *GV = dyn_cast(Ptr)) { Type *ElemTy = GV->getValueType(); if (!Size->isAllOnesValue() && Size->getValue().getLimitedValue() >= DL.getTypeStoreSize(ElemTy)) { Invariants.insert(GV); DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV << "\n"); } else { DEBUG(dbgs() << "Found a global var, but can not treat it as an " "invariant.\n"); } } // Continue even if we do nothing. ++CurInst; continue; } else if (II->getIntrinsicID() == Intrinsic::assume) { DEBUG(dbgs() << "Skipping assume intrinsic.\n"); ++CurInst; continue; } DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n"); return false; } // Resolve function pointers. Function *Callee = dyn_cast(getVal(CS.getCalledValue())); if (!Callee || Callee->mayBeOverridden()) { DEBUG(dbgs() << "Can not resolve function pointer.\n"); return false; // Cannot resolve. } SmallVector Formals; for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) Formals.push_back(getVal(*i)); if (Callee->isDeclaration()) { // If this is a function we can constant fold, do it. if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) { InstResult = C; DEBUG(dbgs() << "Constant folded function call. Result: " << *InstResult << "\n"); } else { DEBUG(dbgs() << "Can not constant fold function call.\n"); return false; } } else { if (Callee->getFunctionType()->isVarArg()) { DEBUG(dbgs() << "Can not constant fold vararg function call.\n"); return false; } Constant *RetVal = nullptr; // Execute the call, if successful, use the return value. 
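        // Sketch of the frame discipline (illustrative example): evaluating
        //   static int x = f();   // where f() in turn calls g()
        // pushes one SSA-value frame for f, then one for g; each frame is
        // popped only after its callee returns successfully, so values from
        // different activations cannot be confused.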
ValueStack.emplace_back(); if (!EvaluateFunction(Callee, RetVal, Formals)) { DEBUG(dbgs() << "Failed to evaluate function.\n"); return false; } ValueStack.pop_back(); InstResult = RetVal; if (InstResult) { DEBUG(dbgs() << "Successfully evaluated function. Result: " << InstResult << "\n\n"); } else { DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n"); } } } else if (isa(CurInst)) { DEBUG(dbgs() << "Found a terminator instruction.\n"); if (BranchInst *BI = dyn_cast(CurInst)) { if (BI->isUnconditional()) { NextBB = BI->getSuccessor(0); } else { ConstantInt *Cond = dyn_cast(getVal(BI->getCondition())); if (!Cond) return false; // Cannot determine. NextBB = BI->getSuccessor(!Cond->getZExtValue()); } } else if (SwitchInst *SI = dyn_cast(CurInst)) { ConstantInt *Val = dyn_cast(getVal(SI->getCondition())); if (!Val) return false; // Cannot determine. NextBB = SI->findCaseValue(Val).getCaseSuccessor(); } else if (IndirectBrInst *IBI = dyn_cast(CurInst)) { Value *Val = getVal(IBI->getAddress())->stripPointerCasts(); if (BlockAddress *BA = dyn_cast(Val)) NextBB = BA->getBasicBlock(); else return false; // Cannot determine. } else if (isa(CurInst)) { NextBB = nullptr; } else { // invoke, unwind, resume, unreachable. DEBUG(dbgs() << "Can not handle terminator."); return false; // Cannot handle this terminator. } // We succeeded at evaluating this block! DEBUG(dbgs() << "Successfully evaluated block.\n"); return true; } else { // Did not know how to evaluate this! DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction." "\n"); return false; } if (!CurInst->use_empty()) { if (ConstantExpr *CE = dyn_cast(InstResult)) InstResult = ConstantFoldConstantExpression(CE, DL, TLI); setVal(&*CurInst, InstResult); } // If we just processed an invoke, we finished evaluating the block. if (InvokeInst *II = dyn_cast(CurInst)) { NextBB = II->getNormalDest(); DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n"); return true; } // Advance program counter. ++CurInst; } } /// Evaluate a call to function F, returning true if successful, false if we /// can't evaluate it. ActualArgs contains the formal arguments for the /// function. bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, const SmallVectorImpl &ActualArgs) { // Check to see if this function is already executing (recursion). If so, // bail out. TODO: we might want to accept limited recursion. if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end()) return false; CallStack.push_back(F); // Initialize arguments to the incoming values specified. unsigned ArgNo = 0; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI, ++ArgNo) setVal(&*AI, ActualArgs[ArgNo]); // ExecutedBlocks - We only handle non-looping, non-recursive code. As such, // we can only evaluate any one basic block at most once. This set keeps // track of what we have executed so we can detect recursive cases etc. SmallPtrSet ExecutedBlocks; // CurBB - The current basic block we're evaluating. BasicBlock *CurBB = &F->front(); BasicBlock::iterator CurInst = CurBB->begin(); while (1) { BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings. DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n"); if (!EvaluateBlock(CurInst, NextBB)) return false; if (!NextBB) { // Successfully running until there's no next block means that we found // the return. Fill it the return value and pop the call stack. 
ReturnInst *RI = cast(CurBB->getTerminator()); if (RI->getNumOperands()) RetVal = getVal(RI->getOperand(0)); CallStack.pop_back(); return true; } // Okay, we succeeded in evaluating this control flow. See if we have // executed the new block before. If so, we have a looping function, // which we cannot evaluate in reasonable time. if (!ExecutedBlocks.insert(NextBB).second) return false; // looped! // Okay, we have never been in this block before. Check to see if there // are any PHI nodes. If so, evaluate them with information about where // we came from. PHINode *PN = nullptr; for (CurInst = NextBB->begin(); (PN = dyn_cast(CurInst)); ++CurInst) setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB))); // Advance to the next block. CurBB = NextBB; } } /// Evaluate static constructors in the function, if we can. Return true if we /// can, false otherwise. static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, const TargetLibraryInfo *TLI) { // Call the function. Evaluator Eval(DL, TLI); Constant *RetValDummy; bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy, SmallVector()); if (EvalSuccess) { ++NumCtorsEvaluated; // We succeeded at evaluation: commit the result. DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '" << F->getName() << "' to " << Eval.getMutatedMemory().size() << " stores.\n"); for (DenseMap::const_iterator I = Eval.getMutatedMemory().begin(), E = Eval.getMutatedMemory().end(); I != E; ++I) CommitValueTo(I->second, I->first); for (GlobalVariable *GV : Eval.getInvariants()) GV->setConstant(true); } return EvalSuccess; } static int compareNames(Constant *const *A, Constant *const *B) { return (*A)->stripPointerCasts()->getName().compare( (*B)->stripPointerCasts()->getName()); } static void setUsedInitializer(GlobalVariable &V, const SmallPtrSet &Init) { if (Init.empty()) { V.eraseFromParent(); return; } // Type of pointer to the array of pointers. PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0); SmallVector UsedArray; for (GlobalValue *GV : Init) { Constant *Cast = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy); UsedArray.push_back(Cast); } // Sort to get deterministic order. array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames); ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size()); Module *M = V.getParent(); V.removeFromParent(); GlobalVariable *NV = new GlobalVariable(*M, ATy, false, llvm::GlobalValue::AppendingLinkage, llvm::ConstantArray::get(ATy, UsedArray), ""); NV->takeName(&V); NV->setSection("llvm.metadata"); delete &V; } namespace { /// An easy to access representation of llvm.used and llvm.compiler.used. 
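/// For example (hypothetical module), these typically look like:
///   @llvm.used = appending global [1 x i8*]
///       [i8* bitcast (void ()* @f to i8*)], section "llvm.metadata"
/// with llvm.compiler.used taking the same shape.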
class LLVMUsed { SmallPtrSet Used; SmallPtrSet CompilerUsed; GlobalVariable *UsedV; GlobalVariable *CompilerUsedV; public: LLVMUsed(Module &M) { UsedV = collectUsedGlobalVariables(M, Used, false); CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true); } typedef SmallPtrSet::iterator iterator; typedef iterator_range used_iterator_range; iterator usedBegin() { return Used.begin(); } iterator usedEnd() { return Used.end(); } used_iterator_range used() { return used_iterator_range(usedBegin(), usedEnd()); } iterator compilerUsedBegin() { return CompilerUsed.begin(); } iterator compilerUsedEnd() { return CompilerUsed.end(); } used_iterator_range compilerUsed() { return used_iterator_range(compilerUsedBegin(), compilerUsedEnd()); } bool usedCount(GlobalValue *GV) const { return Used.count(GV); } bool compilerUsedCount(GlobalValue *GV) const { return CompilerUsed.count(GV); } bool usedErase(GlobalValue *GV) { return Used.erase(GV); } bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); } bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; } bool compilerUsedInsert(GlobalValue *GV) { return CompilerUsed.insert(GV).second; } void syncVariablesAndSets() { if (UsedV) setUsedInitializer(*UsedV, Used); if (CompilerUsedV) setUsedInitializer(*CompilerUsedV, CompilerUsed); } }; } static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) { if (GA.use_empty()) // No use at all. return false; assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) && "We should have removed the duplicated " "element from llvm.compiler.used"); if (!GA.hasOneUse()) // Strictly more than one use. So at least one is not in llvm.used and // llvm.compiler.used. return true; // Exactly one use. Check if it is in llvm.used or llvm.compiler.used. return !U.usedCount(&GA) && !U.compilerUsedCount(&GA); } static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V, const LLVMUsed &U) { unsigned N = 2; assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) && "We should have removed the duplicated " "element from llvm.compiler.used"); if (U.usedCount(&V) || U.compilerUsedCount(&V)) ++N; return V.hasNUsesOrMore(N); } static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) { if (!GA.hasLocalLinkage()) return true; return U.usedCount(&GA) || U.compilerUsedCount(&GA); } static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U, bool &RenameTarget) { RenameTarget = false; bool Ret = false; if (hasUseOtherThanLLVMUsed(GA, U)) Ret = true; // If the alias is externally visible, we may still be able to simplify it. if (!mayHaveOtherReferences(GA, U)) return Ret; // If the aliasee has internal linkage, give it the name and linkage // of the alias, and delete the alias. This turns: // define internal ... @f(...) // @a = alias ... @f // into: // define ... @a(...) Constant *Aliasee = GA.getAliasee(); GlobalValue *Target = cast(Aliasee->stripPointerCasts()); if (!Target->hasLocalLinkage()) return Ret; // Do not perform the transform if multiple aliases potentially target the // aliasee. This check also ensures that it is safe to replace the section // and other attributes of the aliasee with those of the alias. 
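  // E.g. (hypothetical) with
  //   @a1 = alias ... @f
  //   @a2 = alias ... @f
  // renaming @f into @a1 would silently change the section, linkage and
  // other attributes observed through @a2, so the transform is skipped.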
if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U)) return Ret; RenameTarget = true; return true; } bool GlobalOpt::OptimizeGlobalAliases(Module &M) { bool Changed = false; LLVMUsed Used(M); for (GlobalValue *GV : Used.used()) Used.compilerUsedErase(GV); for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;) { GlobalAlias *J = &*I++; // Aliases without names cannot be referenced outside this module. if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage()) J->setLinkage(GlobalValue::InternalLinkage); if (deleteIfDead(*J)) { Changed = true; continue; } // If the aliasee may change at link time, nothing can be done - bail out. if (J->mayBeOverridden()) continue; Constant *Aliasee = J->getAliasee(); GlobalValue *Target = dyn_cast(Aliasee->stripPointerCasts()); // We can't trivially replace the alias with the aliasee if the aliasee is // non-trivial in some way. // TODO: Try to handle non-zero GEPs of local aliasees. if (!Target) continue; Target->removeDeadConstantUsers(); // Make all users of the alias use the aliasee instead. bool RenameTarget; if (!hasUsesToReplace(*J, Used, RenameTarget)) continue; J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType())); ++NumAliasesResolved; Changed = true; if (RenameTarget) { // Give the aliasee the name, linkage and other attributes of the alias. Target->takeName(&*J); Target->setLinkage(J->getLinkage()); Target->setVisibility(J->getVisibility()); Target->setDLLStorageClass(J->getDLLStorageClass()); if (Used.usedErase(&*J)) Used.usedInsert(Target); if (Used.compilerUsedErase(&*J)) Used.compilerUsedInsert(Target); } else if (mayHaveOtherReferences(*J, Used)) continue; // Delete the alias. M.getAliasList().erase(J); ++NumAliasesRemoved; Changed = true; } Used.syncVariablesAndSets(); return Changed; } static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::cxa_atexit)) return nullptr; Function *Fn = M.getFunction(TLI->getName(LibFunc::cxa_atexit)); if (!Fn) return nullptr; FunctionType *FTy = Fn->getFunctionType(); // Checking that the function has the right return type, the right number of // parameters and that they all have pointer types should be enough. if (!FTy->getReturnType()->isIntegerTy() || FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy() || !FTy->getParamType(1)->isPointerTy() || !FTy->getParamType(2)->isPointerTy()) return nullptr; return Fn; } /// Returns whether the given function is an empty C++ destructor and can /// therefore be eliminated. /// Note that we assume that other optimization passes have already simplified /// the code so we only look for a function with a single basic block, where /// the only allowed instructions are 'ret', 'call' to an empty C++ dtor and /// other side-effect free instructions. static bool cxxDtorIsEmpty(const Function &Fn, SmallPtrSet &CalledFunctions) { // FIXME: We could eliminate C++ destructors if they're readonly/readnone and // nounwind, but that doesn't seem worth doing. if (Fn.isDeclaration()) return false; if (++Fn.begin() != Fn.end()) return false; const BasicBlock &EntryBlock = Fn.getEntryBlock(); for (BasicBlock::const_iterator I = EntryBlock.begin(), E = EntryBlock.end(); I != E; ++I) { if (const CallInst *CI = dyn_cast(I)) { // Ignore debug intrinsics. if (isa(CI)) continue; const Function *CalledFn = CI->getCalledFunction(); if (!CalledFn) return false; SmallPtrSet NewCalledFunctions(CalledFunctions); // Don't treat recursive functions as empty. 
if (!NewCalledFunctions.insert(CalledFn).second) return false; if (!cxxDtorIsEmpty(*CalledFn, NewCalledFunctions)) return false; } else if (isa(*I)) return true; // We're done. else if (I->mayHaveSideEffects()) return false; // Destructor with side effects, bail. } return false; } bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { /// Itanium C++ ABI p3.3.5: /// /// After constructing a global (or local static) object, that will require /// destruction on exit, a termination function is registered as follows: /// /// extern "C" int __cxa_atexit ( void (*f)(void *), void *p, void *d ); /// /// This registration, e.g. __cxa_atexit(f,p,d), is intended to cause the /// call f(p) when DSO d is unloaded, before all such termination calls /// registered before this one. It returns zero if registration is /// successful, nonzero on failure. // This pass will look for calls to __cxa_atexit where the function is trivial // and remove them. bool Changed = false; for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end(); I != E;) { // We're only interested in calls. Theoretically, we could handle invoke // instructions as well, but neither llvm-gcc nor clang generate invokes // to __cxa_atexit. CallInst *CI = dyn_cast(*I++); if (!CI) continue; Function *DtorFn = dyn_cast(CI->getArgOperand(0)->stripPointerCasts()); if (!DtorFn) continue; SmallPtrSet CalledFunctions; if (!cxxDtorIsEmpty(*DtorFn, CalledFunctions)) continue; // Just remove the call. CI->replaceAllUsesWith(Constant::getNullValue(CI->getType())); CI->eraseFromParent(); ++NumCXXDtorsRemoved; Changed |= true; } return Changed; } bool GlobalOpt::runOnModule(Module &M) { bool Changed = false; auto &DL = M.getDataLayout(); TLI = &getAnalysis().getTLI(); bool LocalChange = true; while (LocalChange) { LocalChange = false; NotDiscardableComdats.clear(); for (const GlobalVariable &GV : M.globals()) if (const Comdat *C = GV.getComdat()) if (!GV.isDiscardableIfUnused() || !GV.use_empty()) NotDiscardableComdats.insert(C); for (Function &F : M) if (const Comdat *C = F.getComdat()) if (!F.isDefTriviallyDead()) NotDiscardableComdats.insert(C); for (GlobalAlias &GA : M.aliases()) if (const Comdat *C = GA.getComdat()) if (!GA.isDiscardableIfUnused() || !GA.use_empty()) NotDiscardableComdats.insert(C); // Delete functions that are trivially dead, ccc -> fastcc LocalChange |= OptimizeFunctions(M); // Optimize global_ctors list. LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) { return EvaluateStaticConstructor(F, DL, TLI); }); // Optimize non-address-taken globals. LocalChange |= OptimizeGlobalVars(M); // Resolve aliases, when possible. LocalChange |= OptimizeGlobalAliases(M); // Try to remove trivial global destructors if they are not removed // already. Function *CXAAtExitFn = FindCXAAtExit(M, TLI); if (CXAAtExitFn) LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn); Changed |= LocalChange; } // TODO: Move all global ctors functions to the end of the module for code // layout. return Changed; } diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index d77d5745e60c..a1c9dc103253 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1,2915 +1,2914 @@ //===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. 
See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // Rewrite an existing set of gc.statepoints such that they make potential // relocations performed by the garbage collector explicit in the IR. // //===----------------------------------------------------------------------===// #include "llvm/Pass.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/MapVector.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Statepoint.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/Debug.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #define DEBUG_TYPE "rewrite-statepoints-for-gc" using namespace llvm; // Print the liveset found at the insert location static cl::opt PrintLiveSet("spp-print-liveset", cl::Hidden, cl::init(false)); static cl::opt PrintLiveSetSize("spp-print-liveset-size", cl::Hidden, cl::init(false)); // Print out the base pointers for debugging static cl::opt PrintBasePointers("spp-print-base-pointers", cl::Hidden, cl::init(false)); // Cost threshold measuring when it is profitable to rematerialize value instead // of relocating it static cl::opt RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden, cl::init(6)); #ifdef XDEBUG static bool ClobberNonLive = true; #else static bool ClobberNonLive = false; #endif static cl::opt ClobberNonLiveOverride("rs4gc-clobber-non-live", cl::location(ClobberNonLive), cl::Hidden); static cl::opt UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden, cl::init(false)); static cl::opt AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", cl::Hidden, cl::init(true)); /// Should we split vectors of pointers into their individual elements? This /// is known to be buggy, but the alternate implementation isn't yet ready. /// This is purely to provide a debugging and dianostic hook until the vector /// split is replaced with vector relocations. static cl::opt UseVectorSplit("rs4gc-split-vector-values", cl::Hidden, cl::init(true)); namespace { struct RewriteStatepointsForGC : public ModulePass { static char ID; // Pass identification, replacement for typeid RewriteStatepointsForGC() : ModulePass(ID) { initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F); bool runOnModule(Module &M) override { bool Changed = false; for (Function &F : M) Changed |= runOnFunction(F); if (Changed) { // stripNonValidAttributes asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. Since at least // one function changed, we know that the precondition is satisfied. 
stripNonValidAttributes(M); } return Changed; } void getAnalysisUsage(AnalysisUsage &AU) const override { // We add and rewrite a bunch of instructions, but don't really do much // else. We could in theory preserve a lot more analyses here. AU.addRequired(); AU.addRequired(); } /// The IR fed into RewriteStatepointsForGC may have had attributes implying /// dereferenceability that are no longer valid/correct after /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire /// heap. stripNonValidAttributes (conservatively) restores correctness /// by erasing all attributes in the module that externally imply /// dereferenceability. /// Similar reasoning also applies to the noalias attributes. gc.statepoint /// can touch the entire heap including noalias objects. void stripNonValidAttributes(Module &M); // Helpers for stripNonValidAttributes void stripNonValidAttributesFromBody(Function &F); void stripNonValidAttributesFromPrototype(Function &F); }; } // namespace char RewriteStatepointsForGC::ID = 0; ModulePass *llvm::createRewriteStatepointsForGCPass() { return new RewriteStatepointsForGC(); } INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", "Make relocations explicit at statepoints", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc", "Make relocations explicit at statepoints", false, false) namespace { struct GCPtrLivenessData { /// Values defined in this block. DenseMap> KillSet; /// Values used in this block (and thus live); does not included values /// killed within this block. DenseMap> LiveSet; /// Values live into this basic block (i.e. used by any /// instruction in this basic block or ones reachable from here) DenseMap> LiveIn; /// Values live out of this basic block (i.e. live into /// any successor block) DenseMap> LiveOut; }; // The type of the internal cache used inside the findBasePointers family // of functions. From the callers perspective, this is an opaque type and // should not be inspected. // // In the actual implementation this caches two relations: // - The base relation itself (i.e. this pointer is based on that one) // - The base defining value relation (i.e. before base_phi insertion) // Generally, after the execution of a full findBasePointer call, only the // base relation will remain. Internally, we add a mixture of the two // types, then update all the second type to the first type typedef DenseMap DefiningValueMapTy; typedef DenseSet StatepointLiveSetTy; typedef DenseMap, AssertingVH> RematerializedValueMapTy; struct PartiallyConstructedSafepointRecord { /// The set of values known to be live across this safepoint StatepointLiveSetTy LiveSet; /// Mapping from live pointers to a base-defining-value DenseMap PointerToBase; /// The *new* gc.statepoint instruction itself. This produces the token /// that normal path gc.relocates and the gc.result are tied to. Instruction *StatepointToken; /// Instruction to which exceptional gc relocates are attached /// Makes it easier to iterate through them during relocationViaAlloca. Instruction *UnwindToken; /// Record live values we are rematerialized instead of relocating. /// They are not included into 'LiveSet' field. /// Maps rematerialized copy to it's original value. 
RematerializedValueMapTy RematerializedValues; }; } static ArrayRef GetDeoptBundleOperands(ImmutableCallSite CS) { assert(UseDeoptBundles && "Should not be called otherwise!"); Optional DeoptBundle = CS.getOperandBundle("deopt"); if (!DeoptBundle.hasValue()) { assert(AllowStatepointWithNoDeoptInfo && "Found non-leaf call without deopt info!"); return None; } return DeoptBundle.getValue().Inputs; } /// Compute the live-in set for every basic block in the function static void computeLiveInValues(DominatorTree &DT, Function &F, GCPtrLivenessData &Data); /// Given results from the dataflow liveness computation, find the set of live /// Values at a particular instruction. static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data, StatepointLiveSetTy &out); // TODO: Once we can get to the GCStrategy, this becomes // Optional isGCManagedPointer(const Type *Ty) const override { static bool isGCPointerType(Type *T) { if (auto *PT = dyn_cast(T)) // For the sake of this example GC, we arbitrarily pick addrspace(1) as our // GC managed heap. We know that a pointer into this heap needs to be // updated and that no other pointer does. return (1 == PT->getAddressSpace()); return false; } // Return true if this type is one which a) is a gc pointer or contains a GC // pointer and b) is of a type this code expects to encounter as a live value. // (The insertion code will assert that a type which matches (a) and not (b) // is not encountered.) static bool isHandledGCPointerType(Type *T) { // We fully support gc pointers if (isGCPointerType(T)) return true; // We partially support vectors of gc pointers. The code will assert if it // can't handle something. if (auto VT = dyn_cast(T)) if (isGCPointerType(VT->getElementType())) return true; return false; } #ifndef NDEBUG /// Returns true if this type contains a gc pointer whether we know how to /// handle that type or not. static bool containsGCPtrType(Type *Ty) { if (isGCPointerType(Ty)) return true; if (VectorType *VT = dyn_cast(Ty)) return isGCPointerType(VT->getScalarType()); if (ArrayType *AT = dyn_cast(Ty)) return containsGCPtrType(AT->getElementType()); if (StructType *ST = dyn_cast(Ty)) return std::any_of(ST->subtypes().begin(), ST->subtypes().end(), containsGCPtrType); return false; } // Returns true if this is a type which a) is a gc pointer or contains a GC // pointer and b) is of a type which the code doesn't expect (i.e. first class // aggregates). Used to trip assertions. static bool isUnhandledGCPointerType(Type *Ty) { return containsGCPtrType(Ty) && !isHandledGCPointerType(Ty); } #endif static bool order_by_name(Value *a, Value *b) { if (a->hasName() && b->hasName()) { return -1 == a->getName().compare(b->getName()); } else if (a->hasName() && !b->hasName()) { return true; } else if (!a->hasName() && b->hasName()) { return false; } else { // Better than nothing, but not stable return a < b; } } // Return the name of the value suffixed with the provided value, or if the // value didn't have a name, the default value specified. static std::string suffixed_name_or(Value *V, StringRef Suffix, StringRef DefaultName) { return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str(); } // Conservatively identifies any definitions which might be live at the // given instruction. The analysis is performed immediately before the // given instruction. Values defined by that instruction are not considered // live. Values used by that instruction are considered live. 
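// For example (hypothetical IR): in
//   %p = ...                               ; an addrspace(1) pointer
//   call void @foo()                       ; the safepoint
//   %v = load i8, i8 addrspace(1)* %p
// %p is used after the call, so it is live at the safepoint and must be
// relocated; a pointer whose last use precedes the call is not.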
static void analyzeParsePointLiveness(
    DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData,
    const CallSite &CS, PartiallyConstructedSafepointRecord &result) {
  Instruction *inst = CS.getInstruction();

  StatepointLiveSetTy LiveSet;
  findLiveSetAtInst(inst, OriginalLivenessData, LiveSet);

  if (PrintLiveSet) {
    // Note: This output is used by several of the test cases.
    // The order of elements in a set is not stable, so put them in a vector
    // and sort by name.
    SmallVector<Value *, 64> Temp;
    Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end());
    std::sort(Temp.begin(), Temp.end(), order_by_name);
    errs() << "Live Variables:\n";
    for (Value *V : Temp)
      dbgs() << " " << V->getName() << " " << *V << "\n";
  }
  if (PrintLiveSetSize) {
    errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
    errs() << "Number live values: " << LiveSet.size() << "\n";
  }
  result.LiveSet = LiveSet;
}

static bool isKnownBaseResult(Value *V);

namespace {
/// A single base defining value - An immediate base defining value for an
/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
/// For instructions which have multiple pointer [vector] inputs or that
/// transition between vector and scalar types, there is no immediate base
/// defining value.  The 'base defining value' for 'Def' is the transitive
/// closure of this relation stopping at the first instruction which has no
/// immediate base defining value.  The b.d.v. might itself be a base pointer,
/// but it can also be an arbitrary derived pointer.
struct BaseDefiningValueResult {
  /// Contains the value which is the base defining value.
  Value * const BDV;
  /// True if the base defining value is also known to be an actual base
  /// pointer.
  const bool IsKnownBase;
  BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
      : BDV(BDV), IsKnownBase(IsKnownBase) {
#ifndef NDEBUG
    // Check consistency between new and old means of checking whether a BDV
    // is a base.
    bool MustBeBase = isKnownBaseResult(BDV);
    assert(!MustBeBase || MustBeBase == IsKnownBase);
#endif
  }
};
}

static BaseDefiningValueResult findBaseDefiningValue(Value *I);

/// Return a base defining value for the 'Index' element of the given vector
/// instruction 'I'.  If Index is null, returns a BDV for the entire vector
/// 'I'.  As an optimization, this method will try to determine when the
/// element is known to already be a base pointer.  If this can be
/// established, the second value in the returned pair will be true.  Note
/// that either a vector or a pointer typed value can be returned.  For the
/// former, the vector returned is a BDV (and possibly a base) of the entire
/// vector 'I'.  If the latter, the returned pointer is a BDV (or possibly a
/// base) for the particular element in 'I'.
static BaseDefiningValueResult findBaseDefiningValueOfVector(Value *I) {
  // Each case parallels findBaseDefiningValue below, see that code for
  // detailed motivation.

  if (isa<Argument>(I))
    // An incoming argument to the function is a base pointer.
    return BaseDefiningValueResult(I, true);

  if (isa<Constant>(I))
    // Constant vectors consist only of constant pointers.
    return BaseDefiningValueResult(I, true);

  if (isa<LoadInst>(I))
    return BaseDefiningValueResult(I, true);

  if (isa<InsertElementInst>(I))
    // We don't know whether this vector contains entirely base pointers or
    // not.  To be conservatively correct, we treat it as a BDV and will
    // duplicate code as needed to construct a parallel vector of bases.
    return BaseDefiningValueResult(I, false);

  if (isa<ShuffleVectorInst>(I))
    // We don't know whether this vector contains entirely base pointers or
    // not.
    // To be conservatively correct, we treat it as a BDV and will duplicate
    // code as needed to construct a parallel vector of bases.
    // TODO: There are a number of local optimizations which could be applied
    // here for particular shufflevector patterns.
    return BaseDefiningValueResult(I, false);

  // A PHI or Select is a base defining value.  The outer findBasePointer
  // algorithm is responsible for constructing a base value for this BDV.
  assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
         "unknown vector instruction - no base found for vector element");
  return BaseDefiningValueResult(I, false);
}

/// Helper function for findBasePointer - Will return a value which either a)
/// defines the base pointer for the input, b) blocks the simple search
/// (i.e. a PHI or Select of two derived pointers), or c) involves a change
/// from pointer to vector type or back.
static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
  assert(I->getType()->isPtrOrPtrVectorTy() &&
         "Illegal to ask for the base pointer of a non-pointer type");

  if (I->getType()->isVectorTy())
    return findBaseDefiningValueOfVector(I);

  if (isa<Argument>(I))
    // An incoming argument to the function is a base pointer.  We should
    // have never reached here if this argument isn't a gc value.
    return BaseDefiningValueResult(I, true);

  if (isa<Constant>(I))
    // We assume that objects with a constant base (e.g. a global) can't move
    // and don't need to be reported to the collector because they are always
    // live.  All constants have constant bases.  Besides global references,
    // all kinds of constants (e.g. undef, constant expressions, null
    // pointers) can be introduced by the inliner or the optimizer,
    // especially on dynamically dead paths.  See e.g. test4 in constants.ll.
    return BaseDefiningValueResult(I, true);

  if (CastInst *CI = dyn_cast<CastInst>(I)) {
    Value *Def = CI->stripPointerCasts();
    // If stripping pointer casts changes the address space there is an
    // addrspacecast in between.
    assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
               cast<PointerType>(CI->getType())->getAddressSpace() &&
           "unsupported addrspacecast");
    // If we find a cast instruction here, it means we've found a cast which
    // is not simply a pointer cast (i.e. an inttoptr).  We don't know how to
    // handle int->ptr conversion.
    assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
    return findBaseDefiningValue(Def);
  }

  if (isa<LoadInst>(I))
    // The value loaded is a gc base itself.
    return BaseDefiningValueResult(I, true);

  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
    // The base of this GEP is the base.
    return findBaseDefiningValue(GEP->getPointerOperand());

  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    switch (II->getIntrinsicID()) {
    default:
      // fall through to general call handling
      break;
    case Intrinsic::experimental_gc_statepoint:
      llvm_unreachable("statepoints don't produce pointers");
    case Intrinsic::experimental_gc_relocate: {
      // Rerunning safepoint insertion after safepoints are already
      // inserted is not supported.  It could probably be made to work,
      // but why are you doing this?  There's no good reason.
      llvm_unreachable("repeat safepoint insertion is not supported");
    }
    case Intrinsic::gcroot:
      // Currently, this mechanism hasn't been extended to work with gcroot.
      // There's no reason it couldn't be, but I haven't thought about the
      // implications much.
      llvm_unreachable(
          "interaction with the gcroot mechanism is not supported");
    }
  }
  // We assume that functions in the source language only return base
  // pointers.  This should probably be generalized via attributes to support
  // both source language and internal functions.
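  // E.g. a (hypothetical) allocation routine
  //   %p = call i8 addrspace(1)* @new_obj()
  // is treated here as directly producing a base pointer.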
  if (isa<CallInst>(I) || isa<InvokeInst>(I))
    return BaseDefiningValueResult(I, true);

  // I have absolutely no idea how to implement this part yet.  It's not
  // necessarily hard, I just haven't really looked at it yet.
  assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");

  if (isa<AtomicCmpXchgInst>(I))
    // A CAS is effectively an atomic store and load combined under a
    // predicate.  From the perspective of base pointers, we just treat it
    // like a load.
    return BaseDefiningValueResult(I, true);

  assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
                                   "binary ops which don't apply to pointers");

  // The aggregate ops.  Aggregates can either be in the heap or on the
  // stack, but in either case, this is simply a field load.  As a result,
  // this is a defining definition of the base just like a load is.
  if (isa<ExtractValueInst>(I))
    return BaseDefiningValueResult(I, true);

  // We should never see an insert vector since that would require we be
  // tracing back a struct value not a pointer value.
  assert(!isa<InsertValueInst>(I) &&
         "Base pointer for a struct is meaningless");

  // An extractelement produces a base result exactly when its input does.
  // We may need to insert a parallel instruction to extract the appropriate
  // element out of the base vector corresponding to the input.  Given this,
  // it's analogous to the phi and select case even though it's not a merge.
  if (isa<ExtractElementInst>(I))
    // Note: There are a lot of obvious peephole cases here.  These are
    // deliberately handled after the main base pointer inference algorithm
    // to make writing test cases to exercise that code easier.
    return BaseDefiningValueResult(I, false);

  // The last two cases here don't return a base pointer.  Instead, they
  // return a value which dynamically selects from among several base
  // derived pointers (each with its own base potentially).  It's the job of
  // the caller to resolve these.
  assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
         "missing instruction case in findBaseDefiningValue");
  return BaseDefiningValueResult(I, false);
}

/// Returns the base defining value for this value.
static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
  Value *&Cached = Cache[I];
  if (!Cached) {
    Cached = findBaseDefiningValue(I).BDV;
    DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
                 << Cached->getName() << "\n");
  }
  assert(Cache[I] != nullptr);
  return Cached;
}

/// Return a base pointer for this value if known.  Otherwise, return its
/// base defining value.
static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
  Value *Def = findBaseDefiningValueCached(I, Cache);
  auto Found = Cache.find(Def);
  if (Found != Cache.end()) {
    // Either a base-of relation, or a self reference.  Caller must check.
    return Found->second;
  }
  // Only a BDV available
  return Def;
}

/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
/// is it known to be a base pointer?  Or do we need to continue searching.
static bool isKnownBaseResult(Value *V) {
  if (!isa<PHINode>(V) && !isa<SelectInst>(V) &&
      !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
      !isa<ShuffleVectorInst>(V)) {
    // no recursion possible
    return true;
  }
  if (isa<Instruction>(V) &&
      cast<Instruction>(V)->getMetadata("is_base_value")) {
    // This is a previously inserted base phi or select.  We know
    // that this is a base value.
    return true;
  }

  // We need to keep searching
  return false;
}

namespace {
/// Models the state of a single base defining value in the findBasePointer
/// algorithm for determining where a new instruction is needed to propagate
/// the base of this BDV.
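/// For example (hypothetical IR), for
///   %d = phi i8 addrspace(1)* [ %gep1, %bb1 ], [ %gep2, %bb2 ]
/// whose inputs derive from distinct bases %b1 and %b2, the state of %d
/// meets to Conflict, and a parallel
///   %d.base = phi i8 addrspace(1)* [ %b1, %bb1 ], [ %b2, %bb2 ]
/// must be created to carry the base.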
class BDVState {
public:
  enum Status { Unknown, Base, Conflict };

  BDVState(Status s, Value *b = nullptr) : status(s), base(b) {
    assert(status != Base || b);
  }
  explicit BDVState(Value *b) : status(Base), base(b) {}
  BDVState() : status(Unknown), base(nullptr) {}

  Status getStatus() const { return status; }
  Value *getBase() const { return base; }

  bool isBase() const { return getStatus() == Base; }
  bool isUnknown() const { return getStatus() == Unknown; }
  bool isConflict() const { return getStatus() == Conflict; }

  bool operator==(const BDVState &other) const {
    return base == other.base && status == other.status;
  }

  bool operator!=(const BDVState &other) const { return !(*this == other); }

  LLVM_DUMP_METHOD
  void dump() const {
    print(dbgs());
    dbgs() << '\n';
  }

  void print(raw_ostream &OS) const {
    switch (status) {
    case Unknown:
      OS << "U";
      break;
    case Base:
      OS << "B";
      break;
    case Conflict:
      OS << "C";
      break;
    }
    OS << " (" << base << " - "
       << (base ? base->getName() : "nullptr") << "): ";
  }

private:
  Status status;
  AssertingVH<Value> base; // non-null only if status == Base
};
}

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
  State.print(OS);
  return OS;
}
#endif

namespace {
// Values of type BDVState form a lattice, and this is a helper
// class that implements the meet operation.  The meat of the meet
// operation is implemented in MeetBDVStates::pureMeet
class MeetBDVStates {
public:
  /// Initializes the currentResult to the TOP state so that it can be met
  /// with any other state to produce that state.
  MeetBDVStates() {}

  // Destructively meet the current result with the given BDVState
  void meetWith(BDVState otherState) {
    currentResult = meet(otherState, currentResult);
  }

  BDVState getResult() const { return currentResult; }

private:
  BDVState currentResult;

  /// Perform a meet operation on two elements of the BDVState lattice.
  static BDVState meet(BDVState LHS, BDVState RHS) {
    assert((pureMeet(LHS, RHS) == pureMeet(RHS, LHS)) &&
           "math is wrong: meet does not commute!");
    BDVState Result = pureMeet(LHS, RHS);
    DEBUG(dbgs() << "meet of " << LHS << " with " << RHS << " produced "
                 << Result << "\n");
    return Result;
  }

  static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) {
    switch (stateA.getStatus()) {
    case BDVState::Unknown:
      return stateB;

    case BDVState::Base:
      assert(stateA.getBase() && "can't be null");
      if (stateB.isUnknown())
        return stateA;

      if (stateB.isBase()) {
        if (stateA.getBase() == stateB.getBase()) {
          assert(stateA == stateB && "equality broken!");
          return stateA;
        }
        return BDVState(BDVState::Conflict);
      }
      assert(stateB.isConflict() && "only three states!");
      return BDVState(BDVState::Conflict);

    case BDVState::Conflict:
      return stateA;
    }
    llvm_unreachable("only three states!");
  }
};
}

/// For a given value or instruction, figure out what base ptr it's derived
/// from.  For gc objects, this is simply itself.  On success, returns a value
/// which is the base pointer.  (This is reliable and can be used for
/// relocation.)  On failure, returns nullptr.
static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
  Value *def = findBaseOrBDV(I, cache);

  if (isKnownBaseResult(def)) {
    return def;
  }

  // Here's the rough algorithm:
  // - For every SSA value, construct a mapping to either an actual base
  //   pointer or a PHI which obscures the base pointer.
  // - Construct a mapping from PHI to unknown TOP state.  Use an
  //   optimistic algorithm to propagate base pointer information.
  //   The lattice looks like:
  //     UNKNOWN
  //   b1 b2 b3 b4
  //     CONFLICT
  //   When the algorithm terminates, all PHIs will either have a single
  //   concrete base or be in a conflict state.
  // - For every conflict, insert a dummy PHI node without arguments.  Add
  //   these to the base[Instruction] = BasePtr mapping.  For every
  //   non-conflict, add the actual base.
  // - For every conflict, add arguments for the base[a] of each input
  //   arguments.
  //
  // Note: A simpler form of this would be to add the conflict form of all
  // PHIs without running the optimistic algorithm.  This would be
  // analogous to pessimistic data flow and would likely lead to an
  // overall worse solution.

#ifndef NDEBUG
  auto isExpectedBDVType = [](Value *BDV) {
    return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
           isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV);
  };
#endif

  // Once populated, will contain a mapping from each potentially non-base
  // BDV to a lattice value (described above) which corresponds to that BDV.
  // We use the order of insertion (DFS over the def/use graph) to provide a
  // stable deterministic ordering for visiting DenseMaps (which are
  // unordered) below.  This is important for deterministic compilation.
  MapVector<Value *, BDVState> States;

  // Recursively fill in all base defining values reachable from the initial
  // one for which we don't already know a definite base value.
  /* scope */ {
    SmallVector<Value *, 16> Worklist;
    Worklist.push_back(def);
    States.insert(std::make_pair(def, BDVState()));
    while (!Worklist.empty()) {
      Value *Current = Worklist.pop_back_val();
      assert(!isKnownBaseResult(Current) && "why did it get added?");

      auto visitIncomingValue = [&](Value *InVal) {
        Value *Base = findBaseOrBDV(InVal, cache);
        if (isKnownBaseResult(Base))
          // Known bases won't need new instructions introduced and can be
          // ignored safely
          return;
        assert(isExpectedBDVType(Base) && "the only non-base values "
               "we see should be base defining values");
        if (States.insert(std::make_pair(Base, BDVState())).second)
          Worklist.push_back(Base);
      };
      if (PHINode *Phi = dyn_cast<PHINode>(Current)) {
        for (Value *InVal : Phi->incoming_values())
          visitIncomingValue(InVal);
      } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) {
        visitIncomingValue(Sel->getTrueValue());
        visitIncomingValue(Sel->getFalseValue());
      } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
        visitIncomingValue(EE->getVectorOperand());
      } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
        visitIncomingValue(IE->getOperand(0)); // vector operand
        visitIncomingValue(IE->getOperand(1)); // scalar operand
      } else {
        // There is one known class of instructions we know we don't handle.
        assert(isa<ShuffleVectorInst>(Current));
        llvm_unreachable("unimplemented instruction case");
      }
    }
  }

#ifndef NDEBUG
  DEBUG(dbgs() << "States after initialization:\n");
  for (auto Pair : States) {
    DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
  }
#endif

  // Return a phi state for a base defining value.  We'll generate a new
  // base state for known bases and expect to find a cached state otherwise.
  auto getStateForBDV = [&](Value *baseValue) {
    if (isKnownBaseResult(baseValue))
      return BDVState(baseValue);
    auto I = States.find(baseValue);
    assert(I != States.end() && "lookup failed!");
    return I->second;
  };

  bool progress = true;
  while (progress) {
#ifndef NDEBUG
    const size_t oldSize = States.size();
#endif
    progress = false;
    // We're only changing values in this loop, thus safe to keep iterators.
    // Since this is computing a fixed point, the order of visit does not
    // affect the result.  TODO: We could use a worklist here and make this
    // run much faster.
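    // Worked example of the meet (B(x) = known base x, U = unknown,
    // C = conflict), using hypothetical bases %b1 and %b2:
    //   meet(U,      B(%b1)) == B(%b1)
    //   meet(B(%b1), B(%b1)) == B(%b1)
    //   meet(B(%b1), B(%b2)) == C        // distinct bases conflict
    //   meet(C,      <any>)  == C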
    for (auto Pair : States) {
      Value *BDV = Pair.first;
      assert(!isKnownBaseResult(BDV) && "why did it get added?");

      // Given an input value for the current instruction, return a BDVState
      // instance which represents the BDV of that value.
      auto getStateForInput = [&](Value *V) mutable {
        Value *BDV = findBaseOrBDV(V, cache);
        return getStateForBDV(BDV);
      };

      MeetBDVStates calculateMeet;
      if (SelectInst *select = dyn_cast<SelectInst>(BDV)) {
        calculateMeet.meetWith(getStateForInput(select->getTrueValue()));
        calculateMeet.meetWith(getStateForInput(select->getFalseValue()));
      } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) {
        for (Value *Val : Phi->incoming_values())
          calculateMeet.meetWith(getStateForInput(Val));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
        // The 'meet' for an extractelement is slightly trivial, but it's
        // still useful in that it drives us to conflict if our input is.
        calculateMeet.meetWith(getStateForInput(EE->getVectorOperand()));
      } else {
        // Given there's an inherent type mismatch between the operands, the
        // meet will *always* produce Conflict.
        auto *IE = cast<InsertElementInst>(BDV);
        calculateMeet.meetWith(getStateForInput(IE->getOperand(0)));
        calculateMeet.meetWith(getStateForInput(IE->getOperand(1)));
      }

      BDVState oldState = States[BDV];
      BDVState newState = calculateMeet.getResult();
      if (oldState != newState) {
        progress = true;
        States[BDV] = newState;
      }
    }

    assert(oldSize == States.size() &&
           "fixed point shouldn't be adding any new nodes to state");
  }

#ifndef NDEBUG
  DEBUG(dbgs() << "States after meet iteration:\n");
  for (auto Pair : States) {
    DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
  }
#endif

  // Insert Phis for all conflicts
  // TODO: adjust naming patterns to avoid this order of iteration dependency
  for (auto Pair : States) {
    Instruction *I = cast<Instruction>(Pair.first);
    BDVState State = Pair.second;
    assert(!isKnownBaseResult(I) && "why did it get added?");
    assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");

    // extractelement instructions are a bit special in that we may need to
    // insert an extract even when we know an exact base for the instruction.
    // The problem is that we need to convert from a vector base to a scalar
    // base for the particular index we're interested in.
    if (State.isBase() && isa<ExtractElementInst>(I) &&
        isa<VectorType>(State.getBase()->getType())) {
      auto *EE = cast<ExtractElementInst>(I);
      // TODO: In many cases, the new instruction is just EE itself.  We
      // should exploit this, but can't do it here since it would break the
      // invariant about the BDV not being known to be a base.
      auto *BaseInst = ExtractElementInst::Create(
          State.getBase(), EE->getIndexOperand(), "base_ee", EE);
      BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
      States[I] = BDVState(BDVState::Base, BaseInst);
    }

    // Since we're joining a vector and scalar base, they can never be the
    // same.  As a result, we should always see insert element having reached
    // the conflict state.
    if (isa<InsertElementInst>(I)) {
      assert(State.isConflict());
    }

    if (!State.isConflict())
      continue;

    /// Create and insert a new instruction which will represent the base of
    /// the given instruction 'I'.
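    /// E.g. (hypothetical) a conflicted select
    ///   %sel = select i1 %c, i8 addrspace(1)* %d1, i8 addrspace(1)* %d2
    /// gets the placeholder
    ///   %sel.base = select i1 %c, i8 addrspace(1)* undef,
    ///                            i8 addrspace(1)* undef
    /// whose undef operands are replaced with real bases in the fixup loop
    /// further down.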
    auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
      if (isa<PHINode>(I)) {
        BasicBlock *BB = I->getParent();
        int NumPreds = std::distance(pred_begin(BB), pred_end(BB));
        assert(NumPreds > 0 && "how did we reach here");
        std::string Name = suffixed_name_or(I, ".base", "base_phi");
        return PHINode::Create(I->getType(), NumPreds, Name, I);
      } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) {
        // The undef will be replaced later
        UndefValue *Undef = UndefValue::get(Sel->getType());
        std::string Name = suffixed_name_or(I, ".base", "base_select");
        return SelectInst::Create(Sel->getCondition(), Undef, Undef, Name, Sel);
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
        std::string Name = suffixed_name_or(I, ".base", "base_ee");
        return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
                                          EE);
      } else {
        auto *IE = cast<InsertElementInst>(I);
        UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
        UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
        std::string Name = suffixed_name_or(I, ".base", "base_ie");
        return InsertElementInst::Create(VecUndef, ScalarUndef,
                                         IE->getOperand(2), Name, IE);
      }
    };
    Instruction *BaseInst = MakeBaseInstPlaceholder(I);
    // Add metadata marking this as a base value
    BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
    States[I] = BDVState(BDVState::Conflict, BaseInst);
  }

  // Returns an instruction which produces the base pointer for a given
  // instruction.  The instruction is assumed to be an input to one of the
  // BDVs seen in the inference algorithm above.  As such, we must either
  // already know its base defining value is a base, or have inserted a new
  // instruction to propagate the base of its BDV and have entered that newly
  // introduced instruction into the state table.  In either case, we are
  // assured to be able to determine an instruction which produces its base
  // pointer.
  auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
    Value *BDV = findBaseOrBDV(Input, cache);
    Value *Base = nullptr;
    if (isKnownBaseResult(BDV)) {
      Base = BDV;
    } else {
      // Either conflict or base.
      assert(States.count(BDV));
      Base = States[BDV].getBase();
    }
    assert(Base && "can't be null");
    // The cast is needed since base traversal may strip away bitcasts
    if (Base->getType() != Input->getType() && InsertPt) {
      Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt);
    }
    return Base;
  };

  // Fixup all the inputs of the new PHIs.  Visit order needs to be
  // deterministic and predictable because we're naming newly created
  // instructions.
  for (auto Pair : States) {
    Instruction *BDV = cast<Instruction>(Pair.first);
    BDVState State = Pair.second;

    assert(!isKnownBaseResult(BDV) && "why did it get added?");
    assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
    if (!State.isConflict())
      continue;

    if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) {
      PHINode *phi = cast<PHINode>(BDV);
      unsigned NumPHIValues = phi->getNumIncomingValues();
      for (unsigned i = 0; i < NumPHIValues; i++) {
        Value *InVal = phi->getIncomingValue(i);
        BasicBlock *InBB = phi->getIncomingBlock(i);

        // If we've already seen InBB, add the same incoming value
        // we added for it earlier.  The IR verifier requires phi
        // nodes with multiple entries from the same basic block
        // to have the same incoming value for each of those
        // entries.  If we don't do this check here and basephi
        // has a different type than base, we'll end up adding two
        // bitcasts (and hence two distinct values) as incoming
        // values for the same basic block.
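        // E.g. (hypothetical) a switch with two cases branching to the same
        // successor produces
        //   %p = phi ... [ %v, %bb ], [ %v, %bb ]
        // and the verifier insists both entries from %bb stay identical.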
int blockIndex = basephi->getBasicBlockIndex(InBB); if (blockIndex != -1) { Value *oldBase = basephi->getIncomingValue(blockIndex); basephi->addIncoming(oldBase, InBB); #ifndef NDEBUG Value *Base = getBaseForInput(InVal, nullptr); // In essence this assert states: the only way two // values incoming from the same basic block may be // different is by being different bitcasts of the same // value. A cleanup that remains TODO is changing // findBaseOrBDV to return an llvm::Value of the correct // type (and still remain pure). This will remove the // need to add bitcasts. assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() && "sanity -- findBaseOrBDV should be pure!"); #endif continue; } // Find the instruction which produces the base for each input. We may // need to insert a bitcast in the incoming block. // TODO: Need to split critical edges if insertion is needed Value *Base = getBaseForInput(InVal, InBB->getTerminator()); basephi->addIncoming(Base, InBB); } assert(basephi->getNumIncomingValues() == NumPHIValues); } else if (SelectInst *BaseSel = dyn_cast(State.getBase())) { SelectInst *Sel = cast(BDV); // Operand 1 & 2 are true, false path respectively. TODO: refactor to // something more safe and less hacky. for (int i = 1; i <= 2; i++) { Value *InVal = Sel->getOperand(i); // Find the instruction which produces the base for each input. We may // need to insert a bitcast. Value *Base = getBaseForInput(InVal, BaseSel); BaseSel->setOperand(i, Base); } } else if (auto *BaseEE = dyn_cast(State.getBase())) { Value *InVal = cast(BDV)->getVectorOperand(); // Find the instruction which produces the base for each input. We may // need to insert a bitcast. Value *Base = getBaseForInput(InVal, BaseEE); BaseEE->setOperand(0, Base); } else { auto *BaseIE = cast(State.getBase()); auto *BdvIE = cast(BDV); auto UpdateOperand = [&](int OperandIdx) { Value *InVal = BdvIE->getOperand(OperandIdx); Value *Base = getBaseForInput(InVal, BaseIE); BaseIE->setOperand(OperandIdx, Base); }; UpdateOperand(0); // vector operand UpdateOperand(1); // scalar operand } } // Now that we're done with the algorithm, see if we can optimize the // results slightly by reducing the number of new instructions needed. // Arguably, this should be integrated into the algorithm above, but // doing as a post process step is easier to reason about for the moment. DenseMap ReverseMap; SmallPtrSet NewInsts; SmallSetVector, 16> Worklist; // Note: We need to visit the states in a deterministic order. We uses the // Keys we sorted above for this purpose. Note that we are papering over a // bigger problem with the algorithm above - it's visit order is not // deterministic. A larger change is needed to fix this. 
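// The loop below collects every placeholder instruction we inserted for a
// conflict into a worklist, then tries to discard the ones which turned out
// to be unnecessary: a placeholder which is identical to the BDV it was
// created for, or which folds away under SimplifyInstruction, is RAUW'd and
// erased, and any other newly inserted instruction that used it is revisited.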
for (auto Pair : States) { auto *BDV = Pair.first; auto State = Pair.second; Value *Base = State.getBase(); assert(BDV && Base); assert(!isKnownBaseResult(BDV) && "why did it get added?"); assert(isKnownBaseResult(Base) && "must be something we 'know' is a base pointer"); if (!State.isConflict()) continue; ReverseMap[Base] = BDV; if (auto *BaseI = dyn_cast(Base)) { NewInsts.insert(BaseI); Worklist.insert(BaseI); } } auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI, Value *Replacement) { // Add users which are new instructions (excluding self references) for (User *U : BaseI->users()) if (auto *UI = dyn_cast(U)) if (NewInsts.count(UI) && UI != BaseI) Worklist.insert(UI); // Then do the actual replacement NewInsts.erase(BaseI); ReverseMap.erase(BaseI); BaseI->replaceAllUsesWith(Replacement); assert(States.count(BDV)); assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI); States[BDV] = BDVState(BDVState::Conflict, Replacement); BaseI->eraseFromParent(); }; const DataLayout &DL = cast(def)->getModule()->getDataLayout(); while (!Worklist.empty()) { Instruction *BaseI = Worklist.pop_back_val(); assert(NewInsts.count(BaseI)); Value *Bdv = ReverseMap[BaseI]; if (auto *BdvI = dyn_cast(Bdv)) if (BaseI->isIdenticalTo(BdvI)) { DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n"); ReplaceBaseInstWith(Bdv, BaseI, Bdv); continue; } if (Value *V = SimplifyInstruction(BaseI, DL)) { DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n"); ReplaceBaseInstWith(Bdv, BaseI, V); continue; } } // Cache all of our results so we can cheaply reuse them // NOTE: This is actually two caches: one of the base defining value // relation and one of the base pointer relation! FIXME for (auto Pair : States) { auto *BDV = Pair.first; Value *base = Pair.second.getBase(); assert(BDV && base); std::string fromstr = cache.count(BDV) ? cache[BDV]->getName() : "none"; DEBUG(dbgs() << "Updating base value cache" << " for: " << BDV->getName() << " from: " << fromstr << " to: " << base->getName() << "\n"); if (cache.count(BDV)) { // Once we transition from the BDV relation being store in the cache to // the base relation being stored, it must be stable assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) && "base relation should be stable"); } cache[BDV] = base; } assert(cache.count(def)); return cache[def]; } // For a set of live pointers (base and/or derived), identify the base // pointer of the object which they are derived from. This routine will // mutate the IR graph as needed to make the 'base' pointer live at the // definition site of 'derived'. This ensures that any use of 'derived' can // also use 'base'. This may involve the insertion of a number of // additional PHI nodes. // // preconditions: live is a set of pointer type Values // // side effects: may insert PHI nodes into the existing CFG, will preserve // CFG, will not remove or mutate any existing nodes // // post condition: PointerToBase contains one (derived, base) pair for every // pointer in live. Note that derived can be equal to base if the original // pointer was a base pointer. static void findBasePointers(const StatepointLiveSetTy &live, DenseMap &PointerToBase, DominatorTree *DT, DefiningValueMapTy &DVCache) { // For the naming of values inserted to be deterministic - which makes for // much cleaner and more stable tests - we need to assign an order to the // live values. DenseSets do not provide a deterministic order across runs. 
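// To get that order we copy the live set into a vector and sort it by value
// name before calling findBasePointer, so any base phis or selects inserted
// on behalf of a given live value are created (and named) in the same order
// on every run.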
SmallVector Temp; Temp.insert(Temp.end(), live.begin(), live.end()); std::sort(Temp.begin(), Temp.end(), order_by_name); for (Value *ptr : Temp) { Value *base = findBasePointer(ptr, DVCache); assert(base && "failed to find base pointer"); PointerToBase[ptr] = base; assert((!isa(base) || !isa(ptr) || DT->dominates(cast(base)->getParent(), cast(ptr)->getParent())) && "The base we found better dominate the derived pointer"); // If you see this trip and like to live really dangerously, the code should // be correct, just with idioms the verifier can't handle. You can try // disabling the verifier at your own substantial risk. assert(!isa(base) && "the relocation code needs adjustment to handle the relocation of " "a null pointer constant without causing false positives in the " "safepoint ir verifier."); } } /// Find the required based pointers (and adjust the live set) for the given /// parse point. static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, const CallSite &CS, PartiallyConstructedSafepointRecord &result) { DenseMap PointerToBase; findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache); if (PrintBasePointers) { // Note: Need to print these in a stable order since this is checked in // some tests. errs() << "Base Pairs (w/o Relocation):\n"; SmallVector Temp; Temp.reserve(PointerToBase.size()); for (auto Pair : PointerToBase) { Temp.push_back(Pair.first); } std::sort(Temp.begin(), Temp.end(), order_by_name); for (Value *Ptr : Temp) { Value *Base = PointerToBase[Ptr]; errs() << " derived "; Ptr->printAsOperand(errs(), false); errs() << " base "; Base->printAsOperand(errs(), false); errs() << "\n";; } } result.PointerToBase = PointerToBase; } /// Given an updated version of the dataflow liveness results, update the /// liveset and base pointer maps for the call site CS. static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, const CallSite &CS, PartiallyConstructedSafepointRecord &result); static void recomputeLiveInValues( Function &F, DominatorTree &DT, ArrayRef toUpdate, MutableArrayRef records) { // TODO-PERF: reuse the original liveness, then simply run the dataflow // again. The old values are still live and will help it stabilize quickly. GCPtrLivenessData RevisedLivenessData; computeLiveInValues(DT, F, RevisedLivenessData); for (size_t i = 0; i < records.size(); i++) { struct PartiallyConstructedSafepointRecord &info = records[i]; const CallSite &CS = toUpdate[i]; recomputeLiveInValues(RevisedLivenessData, CS, info); } } // When inserting gc.relocate and gc.result calls, we need to ensure there are // no uses of the original value / return value between the gc.statepoint and // the gc.relocate / gc.result call. One case which can arise is a phi node // starting one of the successor blocks. We also need to be able to insert the // gc.relocates only on the path which goes through the statepoint. We might // need to split an edge to make this possible. static BasicBlock * normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent, DominatorTree &DT) { BasicBlock *Ret = BB; if (!BB->getUniquePredecessor()) Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT); // Now that 'Ret' has unique predecessor we can safely remove all phi nodes // from it FoldSingleEntryPHINodes(Ret); assert(!isa(Ret->begin()) && "All PHI nodes should have been removed!"); // At this point, we can safely insert a gc.relocate or gc.result as the first // instruction in Ret if needed. 
return Ret; } // Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. static AttributeSet legalizeCallAttributes(AttributeSet AS) { AttributeSet Ret; for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) { unsigned Index = AS.getSlotIndex(Slot); if (Index == AttributeSet::ReturnIndex || Index == AttributeSet::FunctionIndex) { for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) { // Do not allow certain attributes - just skip them // Safepoint can not be read only or read none. if (Attr.hasAttribute(Attribute::ReadNone) || Attr.hasAttribute(Attribute::ReadOnly)) continue; // These attributes control the generation of the gc.statepoint call / // invoke itself; and once the gc.statepoint is in place, they're of no // use. if (Attr.hasAttribute("statepoint-num-patch-bytes") || Attr.hasAttribute("statepoint-id")) continue; Ret = Ret.addAttributes( AS.getContext(), Index, AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr))); } } // Just skip parameter attributes for now } return Ret; } /// Helper function to place all gc relocates necessary for the given /// statepoint. /// Inputs: /// liveVariables - list of variables to be relocated. /// liveStart - index of the first live variable. /// basePtrs - base pointers. /// statepointToken - statepoint instruction to which relocates should be /// bound. /// Builder - Llvm IR builder to be used to construct new calls. static void CreateGCRelocates(ArrayRef LiveVariables, const int LiveStart, ArrayRef BasePtrs, Instruction *StatepointToken, IRBuilder<> Builder) { if (LiveVariables.empty()) return; auto FindIndex = [](ArrayRef LiveVec, Value *Val) { auto ValIt = std::find(LiveVec.begin(), LiveVec.end(), Val); assert(ValIt != LiveVec.end() && "Val not found in LiveVec!"); size_t Index = std::distance(LiveVec.begin(), ValIt); assert(Index < LiveVec.size() && "Bug in std::find?"); return Index; }; Module *M = StatepointToken->getModule(); // All gc_relocate are generated as i8 addrspace(1)* (or a vector type whose // element type is i8 addrspace(1)*). We originally generated unique // declarations for each pointer type, but this proved problematic because // the intrinsic mangling code is incomplete and fragile. Since we're moving // towards a single unified pointer type anyways, we can just cast everything // to an i8* of the right address space. A bitcast is added later to convert // gc_relocate to the actual value's type. auto getGCRelocateDecl = [&] (Type *Ty) { assert(isHandledGCPointerType(Ty)); auto AS = Ty->getScalarType()->getPointerAddressSpace(); Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS); if (auto *VT = dyn_cast(Ty)) NewTy = VectorType::get(NewTy, VT->getNumElements()); return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, {NewTy}); }; // Lazily populated map from input types to the canonicalized form mentioned // in the comment above. This should probably be cached somewhere more // broadly. 
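// For example (type names hypothetical): a live %obj addrspace(1)* value gets
// a relocate declaration returning i8 addrspace(1)*, and a live
// <2 x %obj addrspace(1)*> value one returning <2 x i8 addrspace(1)*>; the
// cast back to the original pointer type is inserted later, when the
// relocated value is actually consumed (see insertRelocationStores).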
DenseMap TypeToDeclMap; for (unsigned i = 0; i < LiveVariables.size(); i++) { // Generate the gc.relocate call and save the result Value *BaseIdx = Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i])); Value *LiveIdx = Builder.getInt32(LiveStart + i); Type *Ty = LiveVariables[i]->getType(); if (!TypeToDeclMap.count(Ty)) TypeToDeclMap[Ty] = getGCRelocateDecl(Ty); Value *GCRelocateDecl = TypeToDeclMap[Ty]; // only specify a debug name if we can give a useful one CallInst *Reloc = Builder.CreateCall( GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx}, suffixed_name_or(LiveVariables[i], ".relocated", "")); // Trick CodeGen into thinking there are lots of free registers at this // fake call. Reloc->setCallingConv(CallingConv::Cold); } } namespace { /// This struct is used to defer RAUWs and `eraseFromParent` s. Using this /// avoids having to worry about keeping around dangling pointers to Values. class DeferredReplacement { AssertingVH Old; AssertingVH New; public: explicit DeferredReplacement(Instruction *Old, Instruction *New) : Old(Old), New(New) { assert(Old != New && "Not allowed!"); } /// Does the task represented by this instance. void doReplacement() { Instruction *OldI = Old; Instruction *NewI = New; assert(OldI != NewI && "Disallowed at construction?!"); Old = nullptr; New = nullptr; if (NewI) OldI->replaceAllUsesWith(NewI); OldI->eraseFromParent(); } }; } static void makeStatepointExplicitImpl(const CallSite CS, /* to replace */ const SmallVectorImpl &BasePtrs, const SmallVectorImpl &LiveVariables, PartiallyConstructedSafepointRecord &Result, std::vector &Replacements) { assert(BasePtrs.size() == LiveVariables.size()); assert((UseDeoptBundles || isStatepoint(CS)) && "This method expects to be rewriting a statepoint"); // Then go ahead and use the builder do actually do the inserts. We insert // immediately before the previous instruction under the assumption that all // arguments will be available here. We can't insert afterwards since we may // be replacing a terminator. Instruction *InsertBefore = CS.getInstruction(); IRBuilder<> Builder(InsertBefore); ArrayRef GCArgs(LiveVariables); uint64_t StatepointID = 0xABCDEF00; uint32_t NumPatchBytes = 0; uint32_t Flags = uint32_t(StatepointFlags::None); ArrayRef CallArgs; ArrayRef DeoptArgs; ArrayRef TransitionArgs; Value *CallTarget = nullptr; if (UseDeoptBundles) { CallArgs = {CS.arg_begin(), CS.arg_end()}; DeoptArgs = GetDeoptBundleOperands(CS); // TODO: we don't fill in TransitionArgs or Flags in this branch, but we // could have an operand bundle for that too. AttributeSet OriginalAttrs = CS.getAttributes(); Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, "statepoint-id"); if (AttrID.isStringAttribute()) AttrID.getValueAsString().getAsInteger(10, StatepointID); Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute( AttributeSet::FunctionIndex, "statepoint-num-patch-bytes"); if (AttrNumPatchBytes.isStringAttribute()) AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes); CallTarget = CS.getCalledValue(); } else { // This branch will be gone soon, and we will soon only support the // UseDeoptBundles == true configuration. 
Statepoint OldSP(CS); StatepointID = OldSP.getID(); NumPatchBytes = OldSP.getNumPatchBytes(); Flags = OldSP.getFlags(); CallArgs = {OldSP.arg_begin(), OldSP.arg_end()}; DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()}; TransitionArgs = {OldSP.gc_transition_args_begin(), OldSP.gc_transition_args_end()}; CallTarget = OldSP.getCalledValue(); } // Create the statepoint given all the arguments Instruction *Token = nullptr; AttributeSet ReturnAttrs; if (CS.isCall()) { CallInst *ToReplace = cast(CS.getInstruction()); CallInst *Call = Builder.CreateGCStatepointCall( StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs, "safepoint_token"); Call->setTailCall(ToReplace->isTailCall()); Call->setCallingConv(ToReplace->getCallingConv()); // Currently we will fail on parameter attributes and on certain // function attributes. AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); // In case if we can handle this set of attributes - set up function attrs // directly on statepoint and return attrs later for gc_result intrinsic. Call->setAttributes(NewAttrs.getFnAttributes()); ReturnAttrs = NewAttrs.getRetAttributes(); Token = Call; // Put the following gc_result and gc_relocate calls immediately after the // the old call (which we're about to delete) assert(ToReplace->getNextNode() && "Not a terminator, must have next!"); Builder.SetInsertPoint(ToReplace->getNextNode()); Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc()); } else { InvokeInst *ToReplace = cast(CS.getInstruction()); // Insert the new invoke into the old block. We'll remove the old one in a // moment at which point this will become the new terminator for the // original block. InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(), ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs, "statepoint_token"); Invoke->setCallingConv(ToReplace->getCallingConv()); // Currently we will fail on parameter attributes and on certain // function attributes. AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); // In case if we can handle this set of attributes - set up function attrs // directly on statepoint and return attrs later for gc_result intrinsic. Invoke->setAttributes(NewAttrs.getFnAttributes()); ReturnAttrs = NewAttrs.getRetAttributes(); Token = Invoke; // Generate gc relocates in exceptional path BasicBlock *UnwindBlock = ToReplace->getUnwindDest(); assert(!isa(UnwindBlock->begin()) && UnwindBlock->getUniquePredecessor() && "can't safely insert in this block!"); Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt()); Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc()); // Attach exceptional gc relocates to the landingpad. 
Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst(); Result.UnwindToken = ExceptionalToken; const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken, Builder); // Generate gc relocates and returns for normal block BasicBlock *NormalDest = ToReplace->getNormalDest(); assert(!isa(NormalDest->begin()) && NormalDest->getUniquePredecessor() && "can't safely insert in this block!"); Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt()); // gc relocates will be generated later as if it were regular call // statepoint } assert(Token && "Should be set in one of the above branches!"); if (UseDeoptBundles) { Token->setName("statepoint_token"); if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { StringRef Name = CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name); GCResult->setAttributes(CS.getAttributes().getRetAttributes()); // We cannot RAUW or delete CS.getInstruction() because it could be in the // live set of some other safepoint, in which case that safepoint's // PartiallyConstructedSafepointRecord will hold a raw pointer to this // llvm::Instruction. Instead, we defer the replacement and deletion to // after the live sets have been made explicit in the IR, and we no longer // have raw pointers to worry about. Replacements.emplace_back(CS.getInstruction(), GCResult); } else { Replacements.emplace_back(CS.getInstruction(), nullptr); } } else { assert(!CS.getInstruction()->hasNUsesOrMore(2) && "only valid use before rewrite is gc.result"); assert(!CS.getInstruction()->hasOneUse() || isGCResult(cast(*CS.getInstruction()->user_begin()))); // Take the name of the original statepoint token if there was one. Token->takeName(CS.getInstruction()); // Update the gc.result of the original statepoint (if any) to use the newly // inserted statepoint. This is safe to do here since the token can't be // considered a live reference. CS.getInstruction()->replaceAllUsesWith(Token); CS.getInstruction()->eraseFromParent(); } Result.StatepointToken = Token; // Second, create a gc.relocate for every live variable const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder); } namespace { struct NameOrdering { Value *Base; Value *Derived; bool operator()(NameOrdering const &a, NameOrdering const &b) { return -1 == a.Derived->getName().compare(b.Derived->getName()); } }; } static void StabilizeOrder(SmallVectorImpl &BaseVec, SmallVectorImpl &LiveVec) { assert(BaseVec.size() == LiveVec.size()); SmallVector Temp; for (size_t i = 0; i < BaseVec.size(); i++) { NameOrdering v; v.Base = BaseVec[i]; v.Derived = LiveVec[i]; Temp.push_back(v); } std::sort(Temp.begin(), Temp.end(), NameOrdering()); for (size_t i = 0; i < BaseVec.size(); i++) { BaseVec[i] = Temp[i].Base; LiveVec[i] = Temp[i].Derived; } } // Replace an existing gc.statepoint with a new one and a set of gc.relocates // which make the relocations happening at this safepoint explicit. // // WARNING: Does not do any fixup to adjust users of the original live // values. That's the callers responsibility. 
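//
// Roughly (schematic only; operand lists, attributes and intrinsic name
// suffixes are abbreviated, and the %names are hypothetical), a call
// safepoint such as
//   %ret = call i8 addrspace(1)* @foo(i8 addrspace(1)* %obj)
// is rewritten into
//   %token = call token @llvm.experimental.gc.statepoint.<...>(
//                i64 <id>, i32 <num-patch-bytes>, @foo, ...,
//                gc args: %obj.base, %obj)
//   %ret   = call i8 addrspace(1)* @llvm.experimental.gc.result.<...>(token %token)
//   %obj.relocated = call coldcc i8 addrspace(1)*
//                @llvm.experimental.gc.relocate(token %token,
//                                               i32 <base idx>, i32 <derived idx>)
// with uses of %obj after the safepoint patched over to %obj.relocated by the
// later relocation-via-alloca step.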
static void makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, PartiallyConstructedSafepointRecord &Result, std::vector &Replacements) { const auto &LiveSet = Result.LiveSet; const auto &PointerToBase = Result.PointerToBase; // Convert to vector for efficient cross referencing. SmallVector BaseVec, LiveVec; LiveVec.reserve(LiveSet.size()); BaseVec.reserve(LiveSet.size()); for (Value *L : LiveSet) { LiveVec.push_back(L); assert(PointerToBase.count(L)); Value *Base = PointerToBase.find(L)->second; BaseVec.push_back(Base); } assert(LiveVec.size() == BaseVec.size()); // To make the output IR slightly more stable (for use in diffs), ensure a // fixed order of the values in the safepoint (by sorting the value name). // The order is otherwise meaningless. StabilizeOrder(BaseVec, LiveVec); // Do the actual rewriting and delete the old statepoint makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements); } // Helper function for the relocationViaAlloca. // // It receives iterator to the statepoint gc relocates and emits a store to the // assigned location (via allocaMap) for the each one of them. It adds the // visited values into the visitedLiveValues set, which we will later use them // for sanity checking. static void insertRelocationStores(iterator_range GCRelocs, DenseMap &AllocaMap, DenseSet &VisitedLiveValues) { for (User *U : GCRelocs) { GCRelocateInst *Relocate = dyn_cast(U); if (!Relocate) continue; Value *OriginalValue = const_cast(Relocate->getDerivedPtr()); assert(AllocaMap.count(OriginalValue)); Value *Alloca = AllocaMap[OriginalValue]; // Emit store into the related alloca // All gc_relocates are i8 addrspace(1)* typed, and it must be bitcasted to // the correct type according to alloca. assert(Relocate->getNextNode() && "Should always have one since it's not a terminator"); IRBuilder<> Builder(Relocate->getNextNode()); Value *CastedRelocatedValue = Builder.CreateBitCast(Relocate, cast(Alloca)->getAllocatedType(), suffixed_name_or(Relocate, ".casted", "")); StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca); Store->insertAfter(cast(CastedRelocatedValue)); #ifndef NDEBUG VisitedLiveValues.insert(OriginalValue); #endif } } // Helper function for the "relocationViaAlloca". Similar to the // "insertRelocationStores" but works for rematerialized values. static void insertRematerializationStores( RematerializedValueMapTy RematerializedValues, DenseMap &AllocaMap, DenseSet &VisitedLiveValues) { for (auto RematerializedValuePair: RematerializedValues) { Instruction *RematerializedValue = RematerializedValuePair.first; Value *OriginalValue = RematerializedValuePair.second; assert(AllocaMap.count(OriginalValue) && "Can not find alloca for rematerialized value"); Value *Alloca = AllocaMap[OriginalValue]; StoreInst *Store = new StoreInst(RematerializedValue, Alloca); Store->insertAfter(RematerializedValue); #ifndef NDEBUG VisitedLiveValues.insert(OriginalValue); #endif } } /// Do all the relocation update via allocas and mem2reg static void relocationViaAlloca( Function &F, DominatorTree &DT, ArrayRef Live, ArrayRef Records) { #ifndef NDEBUG // record initial number of (static) allocas; we'll check we have the same // number when we get done. 
int InitialAllocaNum = 0; for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E; I++) if (isa(*I)) InitialAllocaNum++; #endif // TODO-PERF: change data structures, reserve DenseMap AllocaMap; SmallVector PromotableAllocas; // Used later to chack that we have enough allocas to store all values std::size_t NumRematerializedValues = 0; PromotableAllocas.reserve(Live.size()); // Emit alloca for "LiveValue" and record it in "allocaMap" and // "PromotableAllocas" auto emitAllocaFor = [&](Value *LiveValue) { AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "", F.getEntryBlock().getFirstNonPHI()); AllocaMap[LiveValue] = Alloca; PromotableAllocas.push_back(Alloca); }; // Emit alloca for each live gc pointer for (Value *V : Live) emitAllocaFor(V); // Emit allocas for rematerialized values for (const auto &Info : Records) for (auto RematerializedValuePair : Info.RematerializedValues) { Value *OriginalValue = RematerializedValuePair.second; if (AllocaMap.count(OriginalValue) != 0) continue; emitAllocaFor(OriginalValue); ++NumRematerializedValues; } // The next two loops are part of the same conceptual operation. We need to // insert a store to the alloca after the original def and at each // redefinition. We need to insert a load before each use. These are split // into distinct loops for performance reasons. // Update gc pointer after each statepoint: either store a relocated value or // null (if no relocated value was found for this gc pointer and it is not a // gc_result). This must happen before we update the statepoint with load of // alloca otherwise we lose the link between statepoint and old def. for (const auto &Info : Records) { Value *Statepoint = Info.StatepointToken; // This will be used for consistency check DenseSet VisitedLiveValues; // Insert stores for normal statepoint gc relocates insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues); // In case if it was invoke statepoint // we will insert stores for exceptional path gc relocates. if (isa(Statepoint)) { insertRelocationStores(Info.UnwindToken->users(), AllocaMap, VisitedLiveValues); } // Do similar thing with rematerialized values insertRematerializationStores(Info.RematerializedValues, AllocaMap, VisitedLiveValues); if (ClobberNonLive) { // As a debugging aid, pretend that an unrelocated pointer becomes null at // the gc.statepoint. This will turn some subtle GC problems into // slightly easier to debug SEGVs. Note that on large IR files with // lots of gc.statepoints this is extremely costly both memory and time // wise. SmallVector ToClobber; for (auto Pair : AllocaMap) { Value *Def = Pair.first; AllocaInst *Alloca = cast(Pair.second); // This value was relocated if (VisitedLiveValues.count(Def)) { continue; } ToClobber.push_back(Alloca); } auto InsertClobbersAt = [&](Instruction *IP) { for (auto *AI : ToClobber) { - auto AIType = cast(AI->getType()); - auto PT = cast(AIType->getElementType()); + auto PT = cast(AI->getAllocatedType()); Constant *CPN = ConstantPointerNull::get(PT); StoreInst *Store = new StoreInst(CPN, AI); Store->insertBefore(IP); } }; // Insert the clobbering stores. These may get intermixed with the // gc.results and gc.relocates, but that's fine. if (auto II = dyn_cast(Statepoint)) { InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt()); InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt()); } else { InsertClobbersAt(cast(Statepoint)->getNextNode()); } } } // Update use with load allocas and add store for gc_relocated. 
for (auto Pair : AllocaMap) { Value *Def = Pair.first; Value *Alloca = Pair.second; // We pre-record the uses of allocas so that we dont have to worry about // later update that changes the user information.. SmallVector Uses; // PERF: trade a linear scan for repeated reallocation Uses.reserve(std::distance(Def->user_begin(), Def->user_end())); for (User *U : Def->users()) { if (!isa(U)) { // If the def has a ConstantExpr use, then the def is either a // ConstantExpr use itself or null. In either case // (recursively in the first, directly in the second), the oop // it is ultimately dependent on is null and this particular // use does not need to be fixed up. Uses.push_back(cast(U)); } } std::sort(Uses.begin(), Uses.end()); auto Last = std::unique(Uses.begin(), Uses.end()); Uses.erase(Last, Uses.end()); for (Instruction *Use : Uses) { if (isa(Use)) { PHINode *Phi = cast(Use); for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) { if (Def == Phi->getIncomingValue(i)) { LoadInst *Load = new LoadInst( Alloca, "", Phi->getIncomingBlock(i)->getTerminator()); Phi->setIncomingValue(i, Load); } } } else { LoadInst *Load = new LoadInst(Alloca, "", Use); Use->replaceUsesOfWith(Def, Load); } } // Emit store for the initial gc value. Store must be inserted after load, // otherwise store will be in alloca's use list and an extra load will be // inserted before it. StoreInst *Store = new StoreInst(Def, Alloca); if (Instruction *Inst = dyn_cast(Def)) { if (InvokeInst *Invoke = dyn_cast(Inst)) { // InvokeInst is a TerminatorInst so the store need to be inserted // into its normal destination block. BasicBlock *NormalDest = Invoke->getNormalDest(); Store->insertBefore(NormalDest->getFirstNonPHI()); } else { assert(!Inst->isTerminator() && "The only TerminatorInst that can produce a value is " "InvokeInst which is handled above."); Store->insertAfter(Inst); } } else { assert(isa(Def)); Store->insertAfter(cast(Alloca)); } } assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues && "we must have the same allocas with lives"); if (!PromotableAllocas.empty()) { // Apply mem2reg to promote alloca to SSA PromoteMemToReg(PromotableAllocas, DT); } #ifndef NDEBUG for (auto &I : F.getEntryBlock()) if (isa(I)) InitialAllocaNum--; assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas"); #endif } /// Implement a unique function which doesn't require we sort the input /// vector. Doing so has the effect of changing the output of a couple of /// tests in ways which make them less useful in testing fused safepoints. template static void unique_unsorted(SmallVectorImpl &Vec) { SmallSet Seen; Vec.erase(std::remove_if(Vec.begin(), Vec.end(), [&](const T &V) { return !Seen.insert(V).second; }), Vec.end()); } /// Insert holders so that each Value is obviously live through the entire /// lifetime of the call. 
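/// This is done with calls to a dummy vararg function ("__tmp_use" below)
/// that take the values as arguments: one call right after a call safepoint,
/// or one in each of the normal and unwind destinations of an invoke. The
/// holders exist only so the liveness computation sees the values as used;
/// they are erased again once liveness has been recomputed.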
static void insertUseHolderAfter(CallSite &CS, const ArrayRef Values, SmallVectorImpl &Holders) { if (Values.empty()) // No values to hold live, might as well not insert the empty holder return; Module *M = CS.getInstruction()->getModule(); // Use a dummy vararg function to actually hold the values live Function *Func = cast(M->getOrInsertFunction( "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true))); if (CS.isCall()) { // For call safepoints insert dummy calls right after safepoint Holders.push_back(CallInst::Create(Func, Values, "", &*++CS.getInstruction()->getIterator())); return; } // For invoke safepooints insert dummy calls both in normal and // exceptional destination blocks auto *II = cast(CS.getInstruction()); Holders.push_back(CallInst::Create( Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt())); Holders.push_back(CallInst::Create( Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt())); } static void findLiveReferences( Function &F, DominatorTree &DT, ArrayRef toUpdate, MutableArrayRef records) { GCPtrLivenessData OriginalLivenessData; computeLiveInValues(DT, F, OriginalLivenessData); for (size_t i = 0; i < records.size(); i++) { struct PartiallyConstructedSafepointRecord &info = records[i]; const CallSite &CS = toUpdate[i]; analyzeParsePointLiveness(DT, OriginalLivenessData, CS, info); } } /// Remove any vector of pointers from the live set by scalarizing them over the /// statepoint instruction. Adds the scalarized pieces to the live set. It /// would be preferable to include the vector in the statepoint itself, but /// the lowering code currently does not handle that. Extending it would be /// slightly non-trivial since it requires a format change. Given how rare /// such cases are (for the moment?) scalarizing is an acceptable compromise. 
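/// Concretely: each live vector-of-pointers is broken into its elements with
/// extractelement just before the statepoint, the elements are added to the
/// live set in its place, and the vector is re-formed with insertelement at
/// the (normal and unwind) continuation points. An alloca plus mem2reg is
/// used to patch the existing uses over to whichever reformed vector
/// dominates them.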
static void splitVectorValues(Instruction *StatepointInst, StatepointLiveSetTy &LiveSet, DenseMap& PointerToBase, DominatorTree &DT) { SmallVector ToSplit; for (Value *V : LiveSet) if (isa(V->getType())) ToSplit.push_back(V); if (ToSplit.empty()) return; DenseMap> ElementMapping; Function &F = *(StatepointInst->getParent()->getParent()); DenseMap AllocaMap; // First is normal return, second is exceptional return (invoke only) DenseMap> Replacements; for (Value *V : ToSplit) { AllocaInst *Alloca = new AllocaInst(V->getType(), "", F.getEntryBlock().getFirstNonPHI()); AllocaMap[V] = Alloca; VectorType *VT = cast(V->getType()); IRBuilder<> Builder(StatepointInst); SmallVector Elements; for (unsigned i = 0; i < VT->getNumElements(); i++) Elements.push_back(Builder.CreateExtractElement(V, Builder.getInt32(i))); ElementMapping[V] = Elements; auto InsertVectorReform = [&](Instruction *IP) { Builder.SetInsertPoint(IP); Builder.SetCurrentDebugLocation(IP->getDebugLoc()); Value *ResultVec = UndefValue::get(VT); for (unsigned i = 0; i < VT->getNumElements(); i++) ResultVec = Builder.CreateInsertElement(ResultVec, Elements[i], Builder.getInt32(i)); return ResultVec; }; if (isa(StatepointInst)) { BasicBlock::iterator Next(StatepointInst); Next++; Instruction *IP = &*(Next); Replacements[V].first = InsertVectorReform(IP); Replacements[V].second = nullptr; } else { InvokeInst *Invoke = cast(StatepointInst); // We've already normalized - check that we don't have shared destination // blocks BasicBlock *NormalDest = Invoke->getNormalDest(); assert(!isa(NormalDest->begin())); BasicBlock *UnwindDest = Invoke->getUnwindDest(); assert(!isa(UnwindDest->begin())); // Insert insert element sequences in both successors Instruction *IP = &*(NormalDest->getFirstInsertionPt()); Replacements[V].first = InsertVectorReform(IP); IP = &*(UnwindDest->getFirstInsertionPt()); Replacements[V].second = InsertVectorReform(IP); } } for (Value *V : ToSplit) { AllocaInst *Alloca = AllocaMap[V]; // Capture all users before we start mutating use lists SmallVector Users; for (User *U : V->users()) Users.push_back(cast(U)); for (Instruction *I : Users) { if (auto Phi = dyn_cast(I)) { for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) if (V == Phi->getIncomingValue(i)) { LoadInst *Load = new LoadInst( Alloca, "", Phi->getIncomingBlock(i)->getTerminator()); Phi->setIncomingValue(i, Load); } } else { LoadInst *Load = new LoadInst(Alloca, "", I); I->replaceUsesOfWith(V, Load); } } // Store the original value and the replacement value into the alloca StoreInst *Store = new StoreInst(V, Alloca); if (auto I = dyn_cast(V)) Store->insertAfter(I); else Store->insertAfter(Alloca); // Normal return for invoke, or call return Instruction *Replacement = cast(Replacements[V].first); (new StoreInst(Replacement, Alloca))->insertAfter(Replacement); // Unwind return for invoke only Replacement = cast_or_null(Replacements[V].second); if (Replacement) (new StoreInst(Replacement, Alloca))->insertAfter(Replacement); } // apply mem2reg to promote alloca to SSA SmallVector Allocas; for (Value *V : ToSplit) Allocas.push_back(AllocaMap[V]); PromoteMemToReg(Allocas, DT); // Update our tracking of live pointers and base mappings to account for the // changes we just made. for (Value *V : ToSplit) { auto &Elements = ElementMapping[V]; LiveSet.erase(V); LiveSet.insert(Elements.begin(), Elements.end()); // We need to update the base mapping as well. 
assert(PointerToBase.count(V)); Value *OldBase = PointerToBase[V]; auto &BaseElements = ElementMapping[OldBase]; PointerToBase.erase(V); assert(Elements.size() == BaseElements.size()); for (unsigned i = 0; i < Elements.size(); i++) { Value *Elem = Elements[i]; PointerToBase[Elem] = BaseElements[i]; } } } // Helper function for the "rematerializeLiveValues". It walks use chain // starting from the "CurrentValue" until it meets "BaseValue". Only "simple" // values are visited (currently it is GEP's and casts). Returns true if it // successfully reached "BaseValue" and false otherwise. // Fills "ChainToBase" array with all visited values. "BaseValue" is not // recorded. static bool findRematerializableChainToBasePointer( SmallVectorImpl &ChainToBase, Value *CurrentValue, Value *BaseValue) { // We have found a base value if (CurrentValue == BaseValue) { return true; } if (GetElementPtrInst *GEP = dyn_cast(CurrentValue)) { ChainToBase.push_back(GEP); return findRematerializableChainToBasePointer(ChainToBase, GEP->getPointerOperand(), BaseValue); } if (CastInst *CI = dyn_cast(CurrentValue)) { if (!CI->isNoopCast(CI->getModule()->getDataLayout())) return false; ChainToBase.push_back(CI); return findRematerializableChainToBasePointer(ChainToBase, CI->getOperand(0), BaseValue); } // Not supported instruction in the chain return false; } // Helper function for the "rematerializeLiveValues". Compute cost of the use // chain we are going to rematerialize. static unsigned chainToBasePointerCost(SmallVectorImpl &Chain, TargetTransformInfo &TTI) { unsigned Cost = 0; for (Instruction *Instr : Chain) { if (CastInst *CI = dyn_cast(Instr)) { assert(CI->isNoopCast(CI->getModule()->getDataLayout()) && "non noop cast is found during rematerialization"); Type *SrcTy = CI->getOperand(0)->getType(); Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy); } else if (GetElementPtrInst *GEP = dyn_cast(Instr)) { // Cost of the address calculation Type *ValTy = GEP->getPointerOperandType()->getPointerElementType(); Cost += TTI.getAddressComputationCost(ValTy); // And cost of the GEP itself // TODO: Use TTI->getGEPCost here (it exists, but appears to be not // allowed for the external usage) if (!GEP->hasAllConstantIndices()) Cost += 2; } else { llvm_unreachable("unsupported instruciton type during rematerialization"); } } return Cost; } // From the statepoint live set pick values that are cheaper to recompute then // to relocate. Remove this values from the live set, rematerialize them after // statepoint and record them in "Info" structure. Note that similar to // relocated values we don't do any user adjustments here. static void rematerializeLiveValues(CallSite CS, PartiallyConstructedSafepointRecord &Info, TargetTransformInfo &TTI) { const unsigned int ChainLengthThreshold = 10; // Record values we are going to delete from this statepoint live set. // We can not di this in following loop due to iterator invalidation. 
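// As a concrete (hypothetical) example of what rematerialization buys us: if
// %derived = getelementptr %base, i64 16 is live across the statepoint, it is
// typically cheaper to re-execute the GEP against the relocated %base after
// the statepoint than to ask the runtime to relocate both %base and %derived;
// in that case the chain {gep} is cloned after the safepoint and %derived is
// dropped from the live set.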
SmallVector LiveValuesToBeDeleted; for (Value *LiveValue: Info.LiveSet) { // For each live pointer find it's defining chain SmallVector ChainToBase; assert(Info.PointerToBase.count(LiveValue)); bool FoundChain = findRematerializableChainToBasePointer(ChainToBase, LiveValue, Info.PointerToBase[LiveValue]); // Nothing to do, or chain is too long if (!FoundChain || ChainToBase.size() == 0 || ChainToBase.size() > ChainLengthThreshold) continue; // Compute cost of this chain unsigned Cost = chainToBasePointerCost(ChainToBase, TTI); // TODO: We can also account for cases when we will be able to remove some // of the rematerialized values by later optimization passes. I.e if // we rematerialized several intersecting chains. Or if original values // don't have any uses besides this statepoint. // For invokes we need to rematerialize each chain twice - for normal and // for unwind basic blocks. Model this by multiplying cost by two. if (CS.isInvoke()) { Cost *= 2; } // If it's too expensive - skip it if (Cost >= RematerializationThreshold) continue; // Remove value from the live set LiveValuesToBeDeleted.push_back(LiveValue); // Clone instructions and record them inside "Info" structure // Walk backwards to visit top-most instructions first std::reverse(ChainToBase.begin(), ChainToBase.end()); // Utility function which clones all instructions from "ChainToBase" // and inserts them before "InsertBefore". Returns rematerialized value // which should be used after statepoint. auto rematerializeChain = [&ChainToBase](Instruction *InsertBefore) { Instruction *LastClonedValue = nullptr; Instruction *LastValue = nullptr; for (Instruction *Instr: ChainToBase) { // Only GEP's and casts are suported as we need to be careful to not // introduce any new uses of pointers not in the liveset. // Note that it's fine to introduce new uses of pointers which were // otherwise not used after this statepoint. assert(isa(Instr) || isa(Instr)); Instruction *ClonedValue = Instr->clone(); ClonedValue->insertBefore(InsertBefore); ClonedValue->setName(Instr->getName() + ".remat"); // If it is not first instruction in the chain then it uses previously // cloned value. We should update it to use cloned value. if (LastClonedValue) { assert(LastValue); ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue); #ifndef NDEBUG // Assert that cloned instruction does not use any instructions from // this chain other than LastClonedValue for (auto OpValue : ClonedValue->operand_values()) { assert(std::find(ChainToBase.begin(), ChainToBase.end(), OpValue) == ChainToBase.end() && "incorrect use in rematerialization chain"); } #endif } LastClonedValue = ClonedValue; LastValue = Instr; } assert(LastClonedValue); return LastClonedValue; }; // Different cases for calls and invokes. For invokes we need to clone // instructions both on normal and unwind path. 
if (CS.isCall()) { Instruction *InsertBefore = CS.getInstruction()->getNextNode(); assert(InsertBefore); Instruction *RematerializedValue = rematerializeChain(InsertBefore); Info.RematerializedValues[RematerializedValue] = LiveValue; } else { InvokeInst *Invoke = cast(CS.getInstruction()); Instruction *NormalInsertBefore = &*Invoke->getNormalDest()->getFirstInsertionPt(); Instruction *UnwindInsertBefore = &*Invoke->getUnwindDest()->getFirstInsertionPt(); Instruction *NormalRematerializedValue = rematerializeChain(NormalInsertBefore); Instruction *UnwindRematerializedValue = rematerializeChain(UnwindInsertBefore); Info.RematerializedValues[NormalRematerializedValue] = LiveValue; Info.RematerializedValues[UnwindRematerializedValue] = LiveValue; } } // Remove rematerializaed values from the live set for (auto LiveValue: LiveValuesToBeDeleted) { Info.LiveSet.erase(LiveValue); } } static bool insertParsePoints(Function &F, DominatorTree &DT, TargetTransformInfo &TTI, SmallVectorImpl &ToUpdate) { #ifndef NDEBUG // sanity check the input std::set Uniqued; Uniqued.insert(ToUpdate.begin(), ToUpdate.end()); assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!"); for (CallSite CS : ToUpdate) { assert(CS.getInstruction()->getParent()->getParent() == &F); assert((UseDeoptBundles || isStatepoint(CS)) && "expected to already be a deopt statepoint"); } #endif // When inserting gc.relocates for invokes, we need to be able to insert at // the top of the successor blocks. See the comment on // normalForInvokeSafepoint on exactly what is needed. Note that this step // may restructure the CFG. for (CallSite CS : ToUpdate) { if (!CS.isInvoke()) continue; auto *II = cast(CS.getInstruction()); normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT); normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT); } // A list of dummy calls added to the IR to keep various values obviously // live in the IR. We'll remove all of these when done. SmallVector Holders; // Insert a dummy call with all of the arguments to the vm_state we'll need // for the actual safepoint insertion. This ensures reference arguments in // the deopt argument list are considered live through the safepoint (and // thus makes sure they get relocated.) for (CallSite CS : ToUpdate) { SmallVector DeoptValues; iterator_range DeoptStateRange = UseDeoptBundles ? iterator_range(GetDeoptBundleOperands(CS)) : iterator_range(Statepoint(CS).vm_state_args()); for (Value *Arg : DeoptStateRange) { assert(!isUnhandledGCPointerType(Arg->getType()) && "support for FCA unimplemented"); if (isHandledGCPointerType(Arg->getType())) DeoptValues.push_back(Arg); } insertUseHolderAfter(CS, DeoptValues, Holders); } SmallVector Records(ToUpdate.size()); // A) Identify all gc pointers which are statically live at the given call // site. findLiveReferences(F, DT, ToUpdate, Records); // B) Find the base pointers for each live pointer /* scope for caching */ { // Cache the 'defining value' relation used in the computation and // insertion of base phis and selects. This ensures that we don't insert // large numbers of duplicate base_phis. DefiningValueMapTy DVCache; for (size_t i = 0; i < Records.size(); i++) { PartiallyConstructedSafepointRecord &info = Records[i]; findBasePointers(DT, DVCache, ToUpdate[i], info); } } // end of cache scope // The base phi insertion logic (for any safepoint) may have inserted new // instructions which are now live at some safepoint. 
The simplest such // example is: // loop: // phi a <-- will be a new base_phi here // safepoint 1 <-- that needs to be live here // gep a + 1 // safepoint 2 // br loop // We insert some dummy calls after each safepoint to definitely hold live // the base pointers which were identified for that safepoint. We'll then // ask liveness for _every_ base inserted to see what is now live. Then we // remove the dummy calls. Holders.reserve(Holders.size() + Records.size()); for (size_t i = 0; i < Records.size(); i++) { PartiallyConstructedSafepointRecord &Info = Records[i]; SmallVector Bases; for (auto Pair : Info.PointerToBase) Bases.push_back(Pair.second); insertUseHolderAfter(ToUpdate[i], Bases, Holders); } // By selecting base pointers, we've effectively inserted new uses. Thus, we // need to rerun liveness. We may *also* have inserted new defs, but that's // not the key issue. recomputeLiveInValues(F, DT, ToUpdate, Records); if (PrintBasePointers) { for (auto &Info : Records) { errs() << "Base Pairs: (w/Relocation)\n"; for (auto Pair : Info.PointerToBase) { errs() << " derived "; Pair.first->printAsOperand(errs(), false); errs() << " base "; Pair.second->printAsOperand(errs(), false); errs() << "\n"; } } } // It is possible that non-constant live variables have a constant base. For // example, a GEP with a variable offset from a global. In this case we can // remove it from the liveset. We already don't add constants to the liveset // because we assume they won't move at runtime and the GC doesn't need to be // informed about them. The same reasoning applies if the base is constant. // Note that the relocation placement code relies on this filtering for // correctness as it expects the base to be in the liveset, which isn't true // if the base is constant. for (auto &Info : Records) for (auto &BasePair : Info.PointerToBase) if (isa(BasePair.second)) Info.LiveSet.erase(BasePair.first); for (CallInst *CI : Holders) CI->eraseFromParent(); Holders.clear(); // Do a limited scalarization of any live at safepoint vector values which // contain pointers. This enables this pass to run after vectorization at // the cost of some possible performance loss. Note: This is known to not // handle updating of the side tables correctly which can lead to relocation // bugs when the same vector is live at multiple statepoints. We're in the // process of implementing the alternate lowering - relocating the // vector-of-pointers as first class item and updating the backend to // understand that - but that's not yet complete. if (UseVectorSplit) for (size_t i = 0; i < Records.size(); i++) { PartiallyConstructedSafepointRecord &Info = Records[i]; Instruction *Statepoint = ToUpdate[i].getInstruction(); splitVectorValues(cast(Statepoint), Info.LiveSet, Info.PointerToBase, DT); } // In order to reduce live set of statepoint we might choose to rematerialize // some values instead of relocating them. This is purely an optimization and // does not influence correctness. for (size_t i = 0; i < Records.size(); i++) rematerializeLiveValues(ToUpdate[i], Records[i], TTI); // We need this to safely RAUW and delete call or invoke return values that // may themselves be live over a statepoint. For details, please see usage in // makeStatepointExplicitImpl. std::vector Replacements; // Now run through and replace the existing statepoints with new ones with // the live variables listed. We do not yet update uses of the values being // relocated. 
We have references to live variables that need to // survive to the last iteration of this loop. (By construction, the // previous statepoint can not be a live variable, thus we can and remove // the old statepoint calls as we go.) for (size_t i = 0; i < Records.size(); i++) makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements); ToUpdate.clear(); // prevent accident use of invalid CallSites for (auto &PR : Replacements) PR.doReplacement(); Replacements.clear(); for (auto &Info : Records) { // These live sets may contain state Value pointers, since we replaced calls // with operand bundles with calls wrapped in gc.statepoint, and some of // those calls may have been def'ing live gc pointers. Clear these out to // avoid accidentally using them. // // TODO: We should create a separate data structure that does not contain // these live sets, and migrate to using that data structure from this point // onward. Info.LiveSet.clear(); Info.PointerToBase.clear(); } // Do all the fixups of the original live variables to their relocated selves SmallVector Live; for (size_t i = 0; i < Records.size(); i++) { PartiallyConstructedSafepointRecord &Info = Records[i]; // We can't simply save the live set from the original insertion. One of // the live values might be the result of a call which needs a safepoint. // That Value* no longer exists and we need to use the new gc_result. // Thankfully, the live set is embedded in the statepoint (and updated), so // we just grab that. Statepoint Statepoint(Info.StatepointToken); Live.insert(Live.end(), Statepoint.gc_args_begin(), Statepoint.gc_args_end()); #ifndef NDEBUG // Do some basic sanity checks on our liveness results before performing // relocation. Relocation can and will turn mistakes in liveness results // into non-sensical code which is must harder to debug. // TODO: It would be nice to test consistency as well assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) && "statepoint must be reachable or liveness is meaningless"); for (Value *V : Statepoint.gc_args()) { if (!isa(V)) // Non-instruction values trivial dominate all possible uses continue; auto *LiveInst = cast(V); assert(DT.isReachableFromEntry(LiveInst->getParent()) && "unreachable values should never be live"); assert(DT.dominates(LiveInst, Info.StatepointToken) && "basic SSA liveness expectation violated by liveness analysis"); } #endif } unique_unsorted(Live); #ifndef NDEBUG // sanity check for (auto *Ptr : Live) assert(isHandledGCPointerType(Ptr->getType()) && "must be a gc pointer type"); #endif relocationViaAlloca(F, DT, Live, Records); return !Records.empty(); } // Handles both return values and arguments for Functions and CallSites. 
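// The attributes stripped below (dereferenceable, dereferenceable_or_null,
// noalias) encode facts about a pointer value which are not guaranteed to
// keep holding once the collector may move the underlying object across a
// safepoint, so they are removed from GC-pointer arguments, return values and
// call sites before the rewrite.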
template static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, unsigned Index) { AttrBuilder R; if (AH.getDereferenceableBytes(Index)) R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, AH.getDereferenceableBytes(Index))); if (AH.getDereferenceableOrNullBytes(Index)) R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull, AH.getDereferenceableOrNullBytes(Index))); if (AH.doesNotAlias(Index)) R.addAttribute(Attribute::NoAlias); if (!R.empty()) AH.setAttributes(AH.getAttributes().removeAttributes( Ctx, Index, AttributeSet::get(Ctx, Index, R))); } void RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { LLVMContext &Ctx = F.getContext(); for (Argument &A : F.args()) if (isa(A.getType())) RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1); if (isa(F.getReturnType())) RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); } void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); for (Instruction &I : instructions(F)) { if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); bool IsImmutableTBAA = MD->getNumOperands() == 4 && mdconst::extract(MD->getOperand(3))->getValue() == 1; if (!IsImmutableTBAA) continue; // no work to do, MD_tbaa is already marked mutable MDNode *Base = cast(MD->getOperand(0)); MDNode *Access = cast(MD->getOperand(1)); uint64_t Offset = mdconst::extract(MD->getOperand(2))->getZExtValue(); MDNode *MutableTBAA = Builder.createTBAAStructTagNode(Base, Access, Offset); I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA); } if (CallSite CS = CallSite(&I)) { for (int i = 0, e = CS.arg_size(); i != e; i++) if (isa(CS.getArgument(i)->getType())) RemoveNonValidAttrAtIndex(Ctx, CS, i + 1); if (isa(CS.getType())) RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex); } } } /// Returns true if this function should be rewritten by this pass. The main /// point of this function is as an extension point for custom logic. static bool shouldRewriteStatepointsIn(Function &F) { // TODO: This should check the GCStrategy if (F.hasGC()) { const auto &FunctionGCName = F.getGC(); const StringRef StatepointExampleName("statepoint-example"); const StringRef CoreCLRName("coreclr"); return (StatepointExampleName == FunctionGCName) || (CoreCLRName == FunctionGCName); } else return false; } void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { #ifndef NDEBUG assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) && "precondition!"); #endif for (Function &F : M) stripNonValidAttributesFromPrototype(F); for (Function &F : M) stripNonValidAttributesFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { // Nothing to do for declarations. if (F.isDeclaration() || F.empty()) return false; // Policy choice says not to rewrite - the most common reason is that we're // compiling code without a GCStrategy. if (!shouldRewriteStatepointsIn(F)) return false; DominatorTree &DT = getAnalysis(F).getDomTree(); TargetTransformInfo &TTI = getAnalysis().getTTI(F); auto NeedsRewrite = [](Instruction &I) { if (UseDeoptBundles) { if (ImmutableCallSite CS = ImmutableCallSite(&I)) return !callsGCLeafFunction(CS); return false; } return isStatepoint(I); }; // Gather all the statepoints which need rewritten. Be careful to only // consider those in reachable code since we need to ask dominance queries // when rewriting. 
We'll delete the unreachable ones in a moment. SmallVector ParsePointNeeded; bool HasUnreachableStatepoint = false; for (Instruction &I : instructions(F)) { // TODO: only the ones with the flag set! if (NeedsRewrite(I)) { if (DT.isReachableFromEntry(I.getParent())) ParsePointNeeded.push_back(CallSite(&I)); else HasUnreachableStatepoint = true; } } bool MadeChange = false; // Delete any unreachable statepoints so that we don't have unrewritten // statepoints surviving this pass. This makes testing easier and the // resulting IR less confusing to human readers. Rather than be fancy, we // just reuse a utility function which removes the unreachable blocks. if (HasUnreachableStatepoint) MadeChange |= removeUnreachableBlocks(F); // Return early if no work to do. if (ParsePointNeeded.empty()) return MadeChange; // As a prepass, go ahead and aggressively destroy single entry phi nodes. // These are created by LCSSA. They have the effect of increasing the size // of liveness sets for no good reason. It may be harder to do this post // insertion since relocations and base phis can confuse things. for (BasicBlock &BB : F) if (BB.getUniquePredecessor()) { MadeChange = true; FoldSingleEntryPHINodes(&BB); } // Before we start introducing relocations, we want to tweak the IR a bit to // avoid unfortunate code generation effects. The main example is that we // want to try to make sure the comparison feeding a branch is after any // safepoints. Otherwise, we end up with a comparison of pre-relocation // values feeding a branch after relocation. This is semantically correct, // but results in extra register pressure since both the pre-relocation and // post-relocation copies must be available in registers. For code without // relocations this is handled elsewhere, but teaching the scheduler to // reverse the transform we're about to do would be slightly complex. // Note: This may extend the live range of the inputs to the icmp and thus // increase the liveset of any statepoint we move over. This is profitable // as long as all statepoints are in rare blocks. If we had in-register // lowering for live values this would be a much safer transform. auto getConditionInst = [](TerminatorInst *TI) -> Instruction* { if (auto *BI = dyn_cast(TI)) if (BI->isConditional()) return dyn_cast(BI->getCondition()); // TODO: Extend this to handle switches return nullptr; }; for (BasicBlock &BB : F) { TerminatorInst *TI = BB.getTerminator(); if (auto *Cond = getConditionInst(TI)) // TODO: Handle more than just ICmps here. We should be able to move // most instructions without side effects or memory access. if (isa(Cond) && Cond->hasOneUse()) { MadeChange = true; Cond->moveBefore(TI); } } MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded); return MadeChange; } // liveness computation via standard dataflow // ------------------------------------------------------------------- // TODO: Consider using bitvectors for liveness, the set of potentially // interesting values should be small and easy to pre-compute. 
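// Illustrative sketch (not part of this patch): the same backward liveness
// dataflow over a toy CFG using plain STL containers instead of LLVM types.
// Names here are hypothetical, and it omits the GC-specific details below
// (seeding LiveOut from PHI uses in successors, excluding constants):
// LiveIn[B] = Uses[B] | (LiveOut[B] - Defs[B]), LiveOut[B] = union of LiveIn
// over successors, iterated with a worklist until a fixed point is reached.
#include <set>
#include <utility>
#include <vector>

namespace liveness_sketch {
struct Block {
  std::set<int> Uses;      // values read in this block (gen set)
  std::set<int> Defs;      // values defined in this block (kill set)
  std::vector<int> Succs;  // successor block indices
  std::vector<int> Preds;  // predecessor block indices
};

inline void computeLiveness(const std::vector<Block> &CFG,
                            std::vector<std::set<int>> &LiveIn,
                            std::vector<std::set<int>> &LiveOut) {
  LiveIn.assign(CFG.size(), {});
  LiveOut.assign(CFG.size(), {});
  // Seed the worklist with every block; iterate until nothing changes.
  std::vector<int> Worklist;
  for (int B = 0, E = (int)CFG.size(); B != E; ++B)
    Worklist.push_back(B);
  while (!Worklist.empty()) {
    int B = Worklist.back();
    Worklist.pop_back();
    // LiveOut[B] is the union of the LiveIn sets of all successors.
    std::set<int> Out;
    for (int S : CFG[B].Succs)
      Out.insert(LiveIn[S].begin(), LiveIn[S].end());
    // LiveIn[B] = Uses[B] | (LiveOut[B] - Defs[B]).
    std::set<int> In = CFG[B].Uses;
    for (int V : Out)
      if (!CFG[B].Defs.count(V))
        In.insert(V);
    LiveOut[B] = std::move(Out);
    if (In != LiveIn[B]) {
      // Our LiveIn grew, so the LiveOut of our predecessors may grow too.
      LiveIn[B] = std::move(In);
      for (int P : CFG[B].Preds)
        Worklist.push_back(P);
    }
  }
}
} // namespace liveness_sketch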
/// Compute the live-in set for the location rbegin starting from
/// the live-out set of the basic block
static void computeLiveInValues(BasicBlock::reverse_iterator rbegin,
                                BasicBlock::reverse_iterator rend,
                                DenseSet<Value *> &LiveTmp) {

  for (BasicBlock::reverse_iterator ritr = rbegin; ritr != rend; ritr++) {
    Instruction *I = &*ritr;

    // KILL/Def - Remove this definition from LiveIn
    LiveTmp.erase(I);

    // Don't consider *uses* in PHI nodes, we handle their contribution to
    // predecessor blocks when we seed the LiveOut sets
    if (isa<PHINode>(I))
      continue;

    // USE - Add to the LiveIn set for this instruction
    for (Value *V : I->operands()) {
      assert(!isUnhandledGCPointerType(V->getType()) &&
             "support for FCA unimplemented");
      if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
        // The choice to exclude all things constant here is slightly subtle.
        // There are two independent reasons:
        // - We assume that things which are constant (from LLVM's definition)
        // do not move at runtime.  For example, the address of a global
        // variable is fixed, even though its contents may not be.
        // - Second, we can't disallow arbitrary inttoptr constants even
        // if the language frontend does.  Optimization passes are free to
        // locally exploit facts without respect to global reachability.  This
        // can create sections of code which are dynamically unreachable and
        // contain just about anything.  (see constants.ll in tests)
        LiveTmp.insert(V);
      }
    }
  }
}

static void computeLiveOutSeed(BasicBlock *BB, DenseSet<Value *> &LiveTmp) {

  for (BasicBlock *Succ : successors(BB)) {
    const BasicBlock::iterator E(Succ->getFirstNonPHI());
    for (BasicBlock::iterator I = Succ->begin(); I != E; I++) {
      PHINode *Phi = cast<PHINode>(&*I);
      Value *V = Phi->getIncomingValueForBlock(BB);
      assert(!isUnhandledGCPointerType(V->getType()) &&
             "support for FCA unimplemented");
      if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
        LiveTmp.insert(V);
      }
    }
  }
}

static DenseSet<Value *> computeKillSet(BasicBlock *BB) {
  DenseSet<Value *> KillSet;
  for (Instruction &I : *BB)
    if (isHandledGCPointerType(I.getType()))
      KillSet.insert(&I);
  return KillSet;
}

#ifndef NDEBUG
/// Check that the items in 'Live' dominate 'TI'.  This is used as a basic
/// sanity check for the liveness computation.
static void checkBasicSSA(DominatorTree &DT, DenseSet<Value *> &Live,
                          TerminatorInst *TI, bool TermOkay = false) {
  for (Value *V : Live) {
    if (auto *I = dyn_cast<Instruction>(V)) {
      // The terminator can be a member of the LiveOut set.  LLVM's definition
      // of instruction dominance states that V does not dominate itself.  As
      // such, we need to special case this to allow it.
      if (TermOkay && TI == I)
        continue;
      assert(DT.dominates(I, TI) &&
             "basic SSA liveness expectation violated by liveness analysis");
    }
  }
}

/// Check that all the liveness sets used during the computation of liveness
/// obey basic SSA properties.  This is useful for finding cases where we miss
/// a def.
static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
                          BasicBlock &BB) {
  checkBasicSSA(DT, Data.LiveSet[&BB], BB.getTerminator());
  checkBasicSSA(DT, Data.LiveOut[&BB], BB.getTerminator(), true);
  checkBasicSSA(DT, Data.LiveIn[&BB], BB.getTerminator());
}
#endif

static void computeLiveInValues(DominatorTree &DT, Function &F,
                                GCPtrLivenessData &Data) {

  SmallSetVector<BasicBlock *, 32> Worklist;
  auto AddPredsToWorklist = [&](BasicBlock *BB) {
    // We use a SetVector so that we don't have duplicates in the worklist.
    Worklist.insert(pred_begin(BB), pred_end(BB));
  };
  auto NextItem = [&]() {
    BasicBlock *BB = Worklist.back();
    Worklist.pop_back();
    return BB;
  };

  // Seed the liveness for each individual block
  for (BasicBlock &BB : F) {
    Data.KillSet[&BB] = computeKillSet(&BB);
    Data.LiveSet[&BB].clear();
    computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);

#ifndef NDEBUG
    for (Value *Kill : Data.KillSet[&BB])
      assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
#endif

    Data.LiveOut[&BB] = DenseSet<Value *>();
    computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
    Data.LiveIn[&BB] = Data.LiveSet[&BB];
    set_union(Data.LiveIn[&BB], Data.LiveOut[&BB]);
    set_subtract(Data.LiveIn[&BB], Data.KillSet[&BB]);
    if (!Data.LiveIn[&BB].empty())
      AddPredsToWorklist(&BB);
  }

  // Propagate that liveness until stable
  while (!Worklist.empty()) {
    BasicBlock *BB = NextItem();

    // Compute our new liveout set, then exit early if it hasn't changed
    // despite the contribution of our successors.
    DenseSet<Value *> LiveOut = Data.LiveOut[BB];
    const auto OldLiveOutSize = LiveOut.size();
    for (BasicBlock *Succ : successors(BB)) {
      assert(Data.LiveIn.count(Succ));
      set_union(LiveOut, Data.LiveIn[Succ]);
    }
    // assert OldLiveOut is a subset of LiveOut
    if (OldLiveOutSize == LiveOut.size()) {
      // If the sets are the same size, then we didn't actually add anything
      // when unioning our successors' LiveIn sets.  Thus, the LiveIn of this
      // block hasn't changed.
      continue;
    }
    Data.LiveOut[BB] = LiveOut;

    // Apply the effects of this basic block
    DenseSet<Value *> LiveTmp = LiveOut;
    set_union(LiveTmp, Data.LiveSet[BB]);
    set_subtract(LiveTmp, Data.KillSet[BB]);

    assert(Data.LiveIn.count(BB));
    const DenseSet<Value *> &OldLiveIn = Data.LiveIn[BB];
    // assert: OldLiveIn is a subset of LiveTmp
    if (OldLiveIn.size() != LiveTmp.size()) {
      Data.LiveIn[BB] = LiveTmp;
      AddPredsToWorklist(BB);
    }
  } // while( !worklist.empty() )

#ifndef NDEBUG
  // Sanity check our output against SSA properties.  This helps catch any
  // missing kills during the above iteration.
  for (BasicBlock &BB : F) {
    checkBasicSSA(DT, Data, BB);
  }
#endif
}

static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
                              StatepointLiveSetTy &Out) {

  BasicBlock *BB = Inst->getParent();

  // Note: The copy is intentional and required
  assert(Data.LiveOut.count(BB));
  DenseSet<Value *> LiveOut = Data.LiveOut[BB];

  // We want to handle the statepoint itself oddly.  Its call result is not
  // live (normal), nor are its arguments (unless they're used again later).
  // This adjustment is specifically what we need to relocate
  BasicBlock::reverse_iterator rend(Inst->getIterator());
  computeLiveInValues(BB->rbegin(), rend, LiveOut);
  LiveOut.erase(Inst);
  Out.insert(LiveOut.begin(), LiveOut.end());
}

static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
                                  const CallSite &CS,
                                  PartiallyConstructedSafepointRecord &Info) {
  Instruction *Inst = CS.getInstruction();
  StatepointLiveSetTy Updated;
  findLiveSetAtInst(Inst, RevisedLivenessData, Updated);

#ifndef NDEBUG
  DenseSet<Value *> Bases;
  for (auto KVPair : Info.PointerToBase) {
    Bases.insert(KVPair.second);
  }
#endif
  // We may have base pointers which are now live that weren't before.  We need
  // to update the PointerToBase structure to reflect this.
  for (auto V : Updated)
    if (!Info.PointerToBase.count(V)) {
      assert(Bases.count(V) && "can't find base for unexpected live value");
      Info.PointerToBase[V] = V;
      continue;
    }

#ifndef NDEBUG
  for (auto V : Updated) {
    assert(Info.PointerToBase.count(V) &&
           "must be able to find base for live value");
  }
#endif

  // Remove any stale base mappings - this can happen since our liveness is
  // more precise than the one inherent in the base pointer analysis
  DenseSet<Value *> ToErase;
  for (auto KVPair : Info.PointerToBase)
    if (!Updated.count(KVPair.first))
      ToErase.insert(KVPair.first);
  for (auto V : ToErase)
    Info.PointerToBase.erase(V);

#ifndef NDEBUG
  for (auto KVPair : Info.PointerToBase)
    assert(Updated.count(KVPair.first) && "record for non-live value");
#endif

  Info.LiveSet = Updated;
}
diff --git a/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 774538eaa814..c5241ce13566 100644
--- a/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -1,2621 +1,2619 @@
//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This transformation implements the well known scalar replacement of
// aggregates transformation.  This xform breaks up alloca instructions of
// aggregate type (structure or array) into individual alloca instructions for
// each member (if possible).  Then, if possible, it transforms the individual
// alloca instructions into nice clean scalar SSA form.
//
// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they
// often interact, especially for C++ programs.  As such, iterating between
// SRoA, then Mem2Reg until we run out of things to promote works well.
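//
// For intuition, the iteration described above amounts to the following
// (a hedged sketch expressed with the two helpers defined later in this file;
// the actual driver is SROA::runOnFunction below):
//
//   bool Changed = performPromotion(F);    // mem2reg what is already trivial
//   while (performScalarRepl(F)) {         // break an aggregate into pieces
//     Changed = true;
//     if (!performPromotion(F))            // promote the new scalar allocas
//       break;                             // stop when neither makes progress
//   }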
// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; #define DEBUG_TYPE "scalarrepl" STATISTIC(NumReplaced, "Number of allocas broken up"); STATISTIC(NumPromoted, "Number of allocas promoted"); STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); STATISTIC(NumConverted, "Number of aggregates converted to scalar"); namespace { #define SROA SROA_ struct SROA : public FunctionPass { SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) : FunctionPass(ID), HasDomTree(hasDT) { if (T == -1) SRThreshold = 128; else SRThreshold = T; if (ST == -1) StructMemberThreshold = 32; else StructMemberThreshold = ST; if (AT == -1) ArrayElementThreshold = 8; else ArrayElementThreshold = AT; if (SLT == -1) // Do not limit the scalar integer load size if no threshold is given. ScalarLoadThreshold = -1; else ScalarLoadThreshold = SLT; } bool runOnFunction(Function &F) override; bool performScalarRepl(Function &F); bool performPromotion(Function &F); private: bool HasDomTree; /// DeadInsts - Keep track of instructions we have made dead, so that /// we can remove them after we are done working. SmallVector DeadInsts; /// AllocaInfo - When analyzing uses of an alloca instruction, this captures /// information about the uses. All these fields are initialized to false /// and set to true when something is learned. struct AllocaInfo { /// The alloca to promote. AllocaInst *AI; /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite /// looping and avoid redundant work. SmallPtrSet CheckedPHIs; /// isUnsafe - This is set to true if the alloca cannot be SROA'd. bool isUnsafe : 1; /// isMemCpySrc - This is true if this aggregate is memcpy'd from. bool isMemCpySrc : 1; /// isMemCpyDst - This is true if this aggregate is memcpy'd into. bool isMemCpyDst : 1; /// hasSubelementAccess - This is true if a subelement of the alloca is /// ever accessed, or false if the alloca is only accessed with mem /// intrinsics or load/store that only access the entire alloca at once. bool hasSubelementAccess : 1; /// hasALoadOrStore - This is true if there are any loads or stores to it. /// The alloca may just be accessed with memcpy, for example, which would /// not set this. 
bool hasALoadOrStore : 1; explicit AllocaInfo(AllocaInst *ai) : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false), hasSubelementAccess(false), hasALoadOrStore(false) {} }; /// SRThreshold - The maximum alloca size to considered for SROA. unsigned SRThreshold; /// StructMemberThreshold - The maximum number of members a struct can /// contain to be considered for SROA. unsigned StructMemberThreshold; /// ArrayElementThreshold - The maximum number of elements an array can /// have to be considered for SROA. unsigned ArrayElementThreshold; /// ScalarLoadThreshold - The maximum size in bits of scalars to load when /// converting to scalar unsigned ScalarLoadThreshold; void MarkUnsafe(AllocaInfo &I, Instruction *User) { I.isUnsafe = true; DEBUG(dbgs() << " Transformation preventing inst: " << *User << '\n'); } bool isSafeAllocaToScalarRepl(AllocaInst *AI); void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info); void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset, AllocaInfo &Info); void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info); void isSafeMemAccess(uint64_t Offset, uint64_t MemSize, Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess); bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size, const DataLayout &DL); uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy, const DataLayout &DL); void DoScalarReplacement(AllocaInst *AI, std::vector &WorkList); void DeleteDeadInstructions(); void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts); void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts); void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts); void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts); void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, SmallVectorImpl &NewElts); void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, SmallVectorImpl &NewElts); void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVectorImpl &NewElts); bool ShouldAttemptScalarRepl(AllocaInst *AI); }; // SROA_DT - SROA that uses DominatorTree. struct SROA_DT : public SROA { static char ID; public: SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : SROA(T, true, ID, ST, AT, SLT) { initializeSROA_DTPass(*PassRegistry::getPassRegistry()); } // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.setPreservesCFG(); } }; // SROA_SSAUp - SROA that uses SSAUpdater. struct SROA_SSAUp : public SROA { static char ID; public: SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : SROA(T, false, ID, ST, AT, SLT) { initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry()); } // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. 
void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.setPreservesCFG(); } }; } char SROA_DT::ID = 0; char SROA_SSAUp::ID = 0; INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(SROA_DT, "scalarrepl", "Scalar Replacement of Aggregates (DT)", false, false) INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa", "Scalar Replacement of Aggregates (SSAUp)", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", "Scalar Replacement of Aggregates (SSAUp)", false, false) // Public interface to the ScalarReplAggregates pass FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold, bool UseDomTree, int StructMemberThreshold, int ArrayElementThreshold, int ScalarLoadThreshold) { if (UseDomTree) return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold, ScalarLoadThreshold); return new SROA_SSAUp(Threshold, StructMemberThreshold, ArrayElementThreshold, ScalarLoadThreshold); } //===----------------------------------------------------------------------===// // Convert To Scalar Optimization. //===----------------------------------------------------------------------===// namespace { /// ConvertToScalarInfo - This class implements the "Convert To Scalar" /// optimization, which scans the uses of an alloca and determines if it can /// rewrite it in terms of a single new alloca that can be mem2reg'd. class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered in bytes. unsigned AllocaSize; const DataLayout &DL; unsigned ScalarLoadThreshold; /// IsNotTrivial - This is set to true if there is some access to the object /// which means that mem2reg can't promote it. bool IsNotTrivial; /// ScalarKind - Tracks the kind of alloca being considered for promotion, /// computed based on the uses of the alloca rather than the LLVM type system. enum { Unknown, // Accesses via GEPs that are consistent with element access of a vector // type. This will not be converted into a vector unless there is a later // access using an actual vector type. ImplicitVector, // Accesses via vector operations and GEPs that are consistent with the // layout of a vector type. Vector, // An integer bag-of-bits with bitwise operations for insertion and // extraction. Any combination of types can be converted into this kind // of scalar. Integer } ScalarKind; /// VectorTy - This tracks the type that we should promote the vector to if /// it is possible to turn it into a vector. This starts out null, and if it /// isn't possible to turn into a vector type, it gets set to VoidTy. VectorType *VectorTy; /// HadNonMemTransferAccess - True if there is at least one access to the /// alloca that is not a MemTransferInst. We don't want to turn structs into /// large integers unless there is some potential for optimization. bool HadNonMemTransferAccess; /// HadDynamicAccess - True if some element of this alloca was dynamic. /// We don't yet have support for turning a dynamic access into a large /// integer. 
bool HadDynamicAccess; public: explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL, unsigned SLT) : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false), ScalarKind(Unknown), VectorTy(nullptr), HadNonMemTransferAccess(false), HadDynamicAccess(false) { } AllocaInst *TryConvert(AllocaInst *AI); private: bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx); void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset); bool MergeInVectorType(VectorType *VInTy, uint64_t Offset); void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset, Value *NonConstantIdx); Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType, uint64_t Offset, Value* NonConstantIdx, IRBuilder<> &Builder); Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, uint64_t Offset, Value* NonConstantIdx, IRBuilder<> &Builder); }; } // end anonymous namespace. /// TryConvert - Analyze the specified alloca, and if it is safe to do so, /// rewrite it to be a new alloca which is mem2reg'able. This returns the new /// alloca if possible or null if not. AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // If we can't convert this scalar, or if mem2reg can trivially do it, bail // out. if (!CanConvertToScalar(AI, 0, nullptr) || !IsNotTrivial) return nullptr; // If an alloca has only memset / memcpy uses, it may still have an Unknown // ScalarKind. Treat it as an Integer below. if (ScalarKind == Unknown) ScalarKind = Integer; if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8) ScalarKind = Integer; // If we were able to find a vector type that can handle this with // insert/extract elements, and if there was at least one use that had // a vector type, promote this to a vector. We don't want to promote // random stuff that doesn't use vectors (e.g. <9 x double>) because then // we just get a lot of insert/extracts. If at least one vector is // involved, then we probably really do have a union of vector/array. Type *NewTy; if (ScalarKind == Vector) { assert(VectorTy && "Missing type for vector scalar."); DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " << *VectorTy << '\n'); NewTy = VectorTy; // Use the vector type. } else { unsigned BitWidth = AllocaSize * 8; // Do not convert to scalar integer if the alloca size exceeds the // scalar load threshold. if (BitWidth > ScalarLoadThreshold) return nullptr; if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && !HadNonMemTransferAccess && !DL.fitsInLegalInteger(BitWidth)) return nullptr; // Dynamic accesses on integers aren't yet supported. They need us to shift // by a dynamic amount which could be difficult to work out as we might not // know whether to use a left or right shift. if (ScalarKind == Integer && HadDynamicAccess) return nullptr; DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); // Create and insert the integer alloca. NewTy = IntegerType::get(AI->getContext(), BitWidth); } AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "", &AI->getParent()->front()); ConvertUsesToScalar(AI, NewAI, 0, nullptr); return NewAI; } /// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type /// (VectorTy) so far at the offset specified by Offset (which is specified in /// bytes). /// /// There are two cases we handle here: /// 1) A union of vector types of the same size and potentially its elements. /// Here we turn element accesses into insert/extract element operations. 
/// This promotes a <4 x float> with a store of float to the third element /// into a <4 x float> that uses insert element. /// 2) A fully general blob of memory, which we turn into some (potentially /// large) integer type with extract and insert operations where the loads /// and stores would mutate the memory. We mark this by setting VectorTy /// to VoidTy. void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In, uint64_t Offset) { // If we already decided to turn this into a blob of integer memory, there is // nothing to be done. if (ScalarKind == Integer) return; // If this could be contributing to a vector, analyze it. // If the In type is a vector that is the same size as the alloca, see if it // matches the existing VecTy. if (VectorType *VInTy = dyn_cast(In)) { if (MergeInVectorType(VInTy, Offset)) return; } else if (In->isFloatTy() || In->isDoubleTy() || (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 && isPowerOf2_32(In->getPrimitiveSizeInBits()))) { // Full width accesses can be ignored, because they can always be turned // into bitcasts. unsigned EltSize = In->getPrimitiveSizeInBits()/8; if (EltSize == AllocaSize) return; // If we're accessing something that could be an element of a vector, see // if the implied vector agrees with what we already have and if Offset is // compatible with it. if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && (!VectorTy || EltSize == VectorTy->getElementType() ->getPrimitiveSizeInBits()/8)) { if (!VectorTy) { ScalarKind = ImplicitVector; VectorTy = VectorType::get(In, AllocaSize/EltSize); } return; } } // Otherwise, we have a case that we can't handle with an optimized vector // form. We can still turn this into a large integer. ScalarKind = Integer; } /// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore, /// returning true if the type was successfully merged and false otherwise. bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, uint64_t Offset) { if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { // If we're storing/loading a vector of the right size, allow it as a // vector. If this the first vector we see, remember the type so that // we know the element size. If this is a subsequent access, ignore it // even if it is a differing type but the same size. Worst case we can // bitcast the resultant vectors. if (!VectorTy) VectorTy = VInTy; ScalarKind = Vector; return true; } return false; } /// CanConvertToScalar - V is a pointer. If we can convert the pointee and all /// its accesses to a single vector type, return true and set VecTy to /// the new type. If we could convert the alloca into a single promotable /// integer, return true but set VecTy to VoidTy. Further, if the use is not a /// completely trivial use that mem2reg could promote, set IsNotTrivial. Offset /// is the current offset from the base of the alloca being analyzed. /// /// If we see at least one access to the value that is as a vector type, set the /// SawVec flag. bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx) { for (User *U : V->users()) { Instruction *UI = cast(U); if (LoadInst *LI = dyn_cast(UI)) { // Don't break volatile loads. if (!LI->isSimple()) return false; // Don't touch MMX operations. if (LI->getType()->isX86_MMXTy()) return false; HadNonMemTransferAccess = true; MergeInTypeForLoadOrStore(LI->getType(), Offset); continue; } if (StoreInst *SI = dyn_cast(UI)) { // Storing the pointer, not into the value? 
if (SI->getOperand(0) == V || !SI->isSimple()) return false; // Don't touch MMX operations. if (SI->getOperand(0)->getType()->isX86_MMXTy()) return false; HadNonMemTransferAccess = true; MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset); continue; } if (BitCastInst *BCI = dyn_cast(UI)) { if (!onlyUsedByLifetimeMarkers(BCI)) IsNotTrivial = true; // Can't be mem2reg'd. if (!CanConvertToScalar(BCI, Offset, NonConstantIdx)) return false; continue; } if (GetElementPtrInst *GEP = dyn_cast(UI)) { // If this is a GEP with a variable indices, we can't handle it. PointerType* PtrTy = dyn_cast(GEP->getPointerOperandType()); if (!PtrTy) return false; // Compute the offset that this GEP adds to the pointer. SmallVector Indices(GEP->op_begin()+1, GEP->op_end()); Value *GEPNonConstantIdx = nullptr; if (!GEP->hasAllConstantIndices()) { if (!isa(PtrTy->getElementType())) return false; if (NonConstantIdx) return false; GEPNonConstantIdx = Indices.pop_back_val(); if (!GEPNonConstantIdx->getType()->isIntegerTy(32)) return false; HadDynamicAccess = true; } else GEPNonConstantIdx = NonConstantIdx; uint64_t GEPOffset = DL.getIndexedOffset(PtrTy, Indices); // See if all uses can be converted. if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx)) return false; IsNotTrivial = true; // Can't be mem2reg'd. HadNonMemTransferAccess = true; continue; } // If this is a constant sized memset of a constant value (e.g. 0) we can // handle it. if (MemSetInst *MSI = dyn_cast(UI)) { // Store to dynamic index. if (NonConstantIdx) return false; // Store of constant value. if (!isa(MSI->getValue())) return false; // Store of constant size. ConstantInt *Len = dyn_cast(MSI->getLength()); if (!Len) return false; // If the size differs from the alloca, we can only convert the alloca to // an integer bag-of-bits. // FIXME: This should handle all of the cases that are currently accepted // as vector element insertions. if (Len->getZExtValue() != AllocaSize || Offset != 0) ScalarKind = Integer; IsNotTrivial = true; // Can't be mem2reg'd. HadNonMemTransferAccess = true; continue; } // If this is a memcpy or memmove into or out of the whole allocation, we // can handle it like a load or store of the scalar type. if (MemTransferInst *MTI = dyn_cast(UI)) { // Store to dynamic index. if (NonConstantIdx) return false; ConstantInt *Len = dyn_cast(MTI->getLength()); if (!Len || Len->getZExtValue() != AllocaSize || Offset != 0) return false; IsNotTrivial = true; // Can't be mem2reg'd. continue; } // If this is a lifetime intrinsic, we can handle it. if (IntrinsicInst *II = dyn_cast(UI)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end) { continue; } } // Otherwise, we cannot handle this! return false; } return true; } /// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca /// directly. This happens when we are converting an "integer union" to a /// single integer scalar, or when we are converting a "vector union" to a /// vector with insert/extractelement instructions. /// /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. By the end of this, there should be no uses of Ptr. 
void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset, Value* NonConstantIdx) { while (!Ptr->use_empty()) { Instruction *User = cast(Ptr->user_back()); if (BitCastInst *CI = dyn_cast(User)) { ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx); CI->eraseFromParent(); continue; } if (GetElementPtrInst *GEP = dyn_cast(User)) { // Compute the offset that this GEP adds to the pointer. SmallVector Indices(GEP->op_begin()+1, GEP->op_end()); Value* GEPNonConstantIdx = nullptr; if (!GEP->hasAllConstantIndices()) { assert(!NonConstantIdx && "Dynamic GEP reading from dynamic GEP unsupported"); GEPNonConstantIdx = Indices.pop_back_val(); } else GEPNonConstantIdx = NonConstantIdx; uint64_t GEPOffset = DL.getIndexedOffset(GEP->getPointerOperandType(), Indices); ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx); GEP->eraseFromParent(); continue; } IRBuilder<> Builder(User); if (LoadInst *LI = dyn_cast(User)) { // The load is a bit extract from NewAI shifted right by Offset bits. Value *LoadedVal = Builder.CreateLoad(NewAI); Value *NewLoadVal = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, NonConstantIdx, Builder); LI->replaceAllUsesWith(NewLoadVal); LI->eraseFromParent(); continue; } if (StoreInst *SI = dyn_cast(User)) { assert(SI->getOperand(0) != Ptr && "Consistency error!"); Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, NonConstantIdx, Builder); Builder.CreateStore(New, NewAI); SI->eraseFromParent(); // If the load we just inserted is now dead, then the inserted store // overwrote the entire thing. if (Old->use_empty()) Old->eraseFromParent(); continue; } // If this is a constant sized memset of a constant value (e.g. 0) we can // transform it into a store of the expanded constant value. if (MemSetInst *MSI = dyn_cast(User)) { assert(MSI->getRawDest() == Ptr && "Consistency error!"); assert(!NonConstantIdx && "Cannot replace dynamic memset with insert"); int64_t SNumBytes = cast(MSI->getLength())->getSExtValue(); if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { unsigned NumBytes = static_cast(SNumBytes); unsigned Val = cast(MSI->getValue())->getZExtValue(); // Compute the value replicated the right number of times. APInt APVal(NumBytes*8, Val); // Splat the value if non-zero. if (Val) for (unsigned i = 1; i != NumBytes; ++i) APVal |= APVal << 8; Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue( ConstantInt::get(User->getContext(), APVal), Old, Offset, nullptr, Builder); Builder.CreateStore(New, NewAI); // If the load we just inserted is now dead, then the memset overwrote // the entire thing. if (Old->use_empty()) Old->eraseFromParent(); } MSI->eraseFromParent(); continue; } // If this is a memcpy or memmove into or out of the whole allocation, we // can handle it like a load or store of the scalar type. if (MemTransferInst *MTI = dyn_cast(User)) { assert(Offset == 0 && "must be store to start of alloca"); assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert"); // If the source and destination are both to the same alloca, then this is // a noop copy-to-self, just delete it. Otherwise, emit a load and store // as appropriate. 
AllocaInst *OrigAI = cast(GetUnderlyingObject(Ptr, DL, 0)); if (GetUnderlyingObject(MTI->getSource(), DL, 0) != OrigAI) { // Dest must be OrigAI, change this to be a load from the original // pointer (bitcasted), then a store to our new alloca. assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); Value *SrcPtr = MTI->getSource(); PointerType* SPTy = cast(SrcPtr->getType()); PointerType* AIPTy = cast(NewAI->getType()); if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) { - AIPTy = PointerType::get(AIPTy->getElementType(), + AIPTy = PointerType::get(NewAI->getAllocatedType(), SPTy->getAddressSpace()); } SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy); LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); SrcVal->setAlignment(MTI->getAlignment()); Builder.CreateStore(SrcVal, NewAI); } else if (GetUnderlyingObject(MTI->getDest(), DL, 0) != OrigAI) { // Src must be OrigAI, change this to be a load from NewAI then a store // through the original dest pointer (bitcasted). assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); PointerType* DPTy = cast(MTI->getDest()->getType()); PointerType* AIPTy = cast(NewAI->getType()); if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) { - AIPTy = PointerType::get(AIPTy->getElementType(), + AIPTy = PointerType::get(NewAI->getAllocatedType(), DPTy->getAddressSpace()); } Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy); StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr); NewStore->setAlignment(MTI->getAlignment()); } else { // Noop transfer. Src == Dst } MTI->eraseFromParent(); continue; } if (IntrinsicInst *II = dyn_cast(User)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end) { // There's no need to preserve these, as the resulting alloca will be // converted to a register anyways. II->eraseFromParent(); continue; } } llvm_unreachable("Unsupported operation!"); } } /// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer /// or vector value FromVal, extracting the bits from the offset specified by /// Offset. This returns the value, which is of type ToType. /// /// This happens when we are converting an "integer union" to a single /// integer scalar, or when we are converting a "vector union" to a vector with /// insert/extractelement instructions. /// /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. Value *ConvertToScalarInfo:: ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, uint64_t Offset, Value* NonConstantIdx, IRBuilder<> &Builder) { // If the load is of the whole new alloca, no conversion is needed. Type *FromType = FromVal->getType(); if (FromType == ToType && Offset == 0) return FromVal; // If the result alloca is a vector type, this is either an element // access or a bitcast to another vector type of the same size. if (VectorType *VTy = dyn_cast(FromType)) { unsigned FromTypeSize = DL.getTypeAllocSize(FromType); unsigned ToTypeSize = DL.getTypeAllocSize(ToType); if (FromTypeSize == ToTypeSize) return Builder.CreateBitCast(FromVal, ToType); // Otherwise it must be an element access. unsigned Elt = 0; if (Offset) { unsigned EltSize = DL.getTypeAllocSizeInBits(VTy->getElementType()); Elt = Offset/EltSize; assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } // Return the element extracted out of it. 
Value *Idx; if (NonConstantIdx) { if (Elt) Idx = Builder.CreateAdd(NonConstantIdx, Builder.getInt32(Elt), "dyn.offset"); else Idx = NonConstantIdx; } else Idx = Builder.getInt32(Elt); Value *V = Builder.CreateExtractElement(FromVal, Idx); if (V->getType() != ToType) V = Builder.CreateBitCast(V, ToType); return V; } // If ToType is a first class aggregate, extract out each of the pieces and // use insertvalue's to form the FCA. if (StructType *ST = dyn_cast(ToType)) { assert(!NonConstantIdx && "Dynamic indexing into struct types not supported"); const StructLayout &Layout = *DL.getStructLayout(ST); Value *Res = UndefValue::get(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), nullptr, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } if (ArrayType *AT = dyn_cast(ToType)) { assert(!NonConstantIdx && "Dynamic indexing into array types not supported"); uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); Value *Res = UndefValue::get(AT); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), Offset+i*EltSize, nullptr, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } // Otherwise, this must be a union that was converted to an integer value. IntegerType *NTy = cast(FromVal->getType()); // If this is a big-endian system and the load is narrower than the // full alloca type, we need to do a shift to get the right bits. int ShAmt = 0; if (DL.isBigEndian()) { // On big-endian machines, the lowest bit is stored at the bit offset // from the pointer given by getTypeStoreSizeInBits. This matters for // integers with a bitwidth that is not a multiple of 8. ShAmt = DL.getTypeStoreSizeInBits(NTy) - DL.getTypeStoreSizeInBits(ToType) - Offset; } else { ShAmt = Offset; } // Note: we support negative bitwidths (with shl) which are not defined. // We do this to support (f.e.) loads off the end of a structure where // only some bits are used. if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateLShr(FromVal, ConstantInt::get(FromVal->getType(), ShAmt)); else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateShl(FromVal, ConstantInt::get(FromVal->getType(), -ShAmt)); // Finally, unconditionally truncate the integer to the right width. unsigned LIBitWidth = DL.getTypeSizeInBits(ToType); if (LIBitWidth < NTy->getBitWidth()) FromVal = Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), LIBitWidth)); else if (LIBitWidth > NTy->getBitWidth()) FromVal = Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), LIBitWidth)); // If the result is an integer, this is a trunc or bitcast. if (ToType->isIntegerTy()) { // Should be done. } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { // Just do a bitcast, we know the sizes match up. FromVal = Builder.CreateBitCast(FromVal, ToType); } else { // Otherwise must be a pointer. FromVal = Builder.CreateIntToPtr(FromVal, ToType); } assert(FromVal->getType() == ToType && "Didn't convert right?"); return FromVal; } /// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer /// or vector value "Old" at the offset specified by Offset. 
/// /// This happens when we are converting an "integer union" to a /// single integer scalar, or when we are converting a "vector union" to a /// vector with insert/extractelement instructions. /// /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. /// /// NonConstantIdx is an index value if there was a GEP with a non-constant /// index value. If this is 0 then all GEPs used to find this insert address /// are constant. Value *ConvertToScalarInfo:: ConvertScalar_InsertValue(Value *SV, Value *Old, uint64_t Offset, Value* NonConstantIdx, IRBuilder<> &Builder) { // Convert the stored type to the actual type, shift it left to insert // then 'or' into place. Type *AllocaType = Old->getType(); LLVMContext &Context = Old->getContext(); if (VectorType *VTy = dyn_cast(AllocaType)) { uint64_t VecSize = DL.getTypeAllocSizeInBits(VTy); uint64_t ValSize = DL.getTypeAllocSizeInBits(SV->getType()); // Changing the whole vector with memset or with an access of a different // vector type? if (ValSize == VecSize) return Builder.CreateBitCast(SV, AllocaType); // Must be an element insertion. Type *EltTy = VTy->getElementType(); if (SV->getType() != EltTy) SV = Builder.CreateBitCast(SV, EltTy); uint64_t EltSize = DL.getTypeAllocSizeInBits(EltTy); unsigned Elt = Offset/EltSize; Value *Idx; if (NonConstantIdx) { if (Elt) Idx = Builder.CreateAdd(NonConstantIdx, Builder.getInt32(Elt), "dyn.offset"); else Idx = NonConstantIdx; } else Idx = Builder.getInt32(Elt); return Builder.CreateInsertElement(Old, SV, Idx); } // If SV is a first-class aggregate value, insert each value recursively. if (StructType *ST = dyn_cast(SV->getType())) { assert(!NonConstantIdx && "Dynamic indexing into struct types not supported"); const StructLayout &Layout = *DL.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), nullptr, Builder); } return Old; } if (ArrayType *AT = dyn_cast(SV->getType())) { assert(!NonConstantIdx && "Dynamic indexing into array types not supported"); uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, nullptr, Builder); } return Old; } // If SV is a float, convert it to the appropriate integer type. // If it is a pointer, do the same. unsigned SrcWidth = DL.getTypeSizeInBits(SV->getType()); unsigned DestWidth = DL.getTypeSizeInBits(AllocaType); unsigned SrcStoreWidth = DL.getTypeStoreSizeInBits(SV->getType()); unsigned DestStoreWidth = DL.getTypeStoreSizeInBits(AllocaType); if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) SV = Builder.CreatePtrToInt(SV, DL.getIntPtrType(SV->getType())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { if (SV->getType()->getPrimitiveSizeInBits() < AllocaType->getPrimitiveSizeInBits()) SV = Builder.CreateZExt(SV, AllocaType); else { // Truncation may be needed if storing more than the alloca can hold // (undefined behavior). 
SV = Builder.CreateTrunc(SV, AllocaType); SrcWidth = DestWidth; SrcStoreWidth = DestStoreWidth; } } // If this is a big-endian system and the store is narrower than the // full alloca type, we need to do a shift to get the right bits. int ShAmt = 0; if (DL.isBigEndian()) { // On big-endian machines, the lowest bit is stored at the bit offset // from the pointer given by getTypeStoreSizeInBits. This matters for // integers with a bitwidth that is not a multiple of 8. ShAmt = DestStoreWidth - SrcStoreWidth - Offset; } else { ShAmt = Offset; } // Note: we support negative bitwidths (with shr) which are not defined. // We do this to support (f.e.) stores off the end of a structure where // only some bits in the structure are set. APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt)); Mask <<= ShAmt; } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt)); Mask = Mask.lshr(-ShAmt); } // Mask out the bits we are about to insert from the old value, and or // in the new bits. if (SrcWidth != DestWidth) { assert(DestWidth > SrcWidth); Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask"); SV = Builder.CreateOr(Old, SV, "ins"); } return SV; } //===----------------------------------------------------------------------===// // SRoA Driver //===----------------------------------------------------------------------===// bool SROA::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; bool Changed = performPromotion(F); while (1) { bool LocalChange = performScalarRepl(F); if (!LocalChange) break; // No need to repromote if no scalarrepl Changed = true; LocalChange = performPromotion(F); if (!LocalChange) break; // No need to re-scalarrepl if no promotion } return Changed; } namespace { class AllocaPromoter : public LoadAndStorePromoter { AllocaInst *AI; DIBuilder *DIB; SmallVector DDIs; SmallVector DVIs; public: AllocaPromoter(ArrayRef Insts, SSAUpdater &S, DIBuilder *DB) : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {} void run(AllocaInst *AI, const SmallVectorImpl &Insts) { // Remember which alloca we're promoting (for isInstInList). 
this->AI = AI; if (auto *L = LocalAsMetadata::getIfExists(AI)) { if (auto *DINode = MetadataAsValue::getIfExists(AI->getContext(), L)) { for (User *U : DINode->users()) if (DbgDeclareInst *DDI = dyn_cast(U)) DDIs.push_back(DDI); else if (DbgValueInst *DVI = dyn_cast(U)) DVIs.push_back(DVI); } } LoadAndStorePromoter::run(Insts); AI->eraseFromParent(); for (SmallVectorImpl::iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; DDI->eraseFromParent(); } for (SmallVectorImpl::iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; DVI->eraseFromParent(); } } bool isInstInList(Instruction *I, const SmallVectorImpl &Insts) const override { if (LoadInst *LI = dyn_cast(I)) return LI->getOperand(0) == AI; return cast(I)->getPointerOperand() == AI; } void updateDebugInfo(Instruction *Inst) const override { for (SmallVectorImpl::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast(Inst)) ConvertDebugDeclareToDebugValue(DDI, SI, *DIB); else if (LoadInst *LI = dyn_cast(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); } for (SmallVectorImpl::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = nullptr; if (StoreInst *SI = dyn_cast(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. if (ZExtInst *ZExt = dyn_cast(SI->getOperand(0))) Arg = dyn_cast(ZExt->getOperand(0)); if (SExtInst *SExt = dyn_cast(SI->getOperand(0))) Arg = dyn_cast(SExt->getOperand(0)); if (!Arg) Arg = SI->getOperand(0); } else if (LoadInst *LI = dyn_cast(Inst)) { Arg = LI->getOperand(0); } else { continue; } DIB->insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(), DVI->getExpression(), DVI->getDebugLoc(), Inst); } } }; } // end anon namespace /// isSafeSelectToSpeculate - Select instructions that use an alloca and are /// subsequently loaded can be rewritten to load both input pointers and then /// select between the result, allowing the load of the alloca to be promoted. /// From this: /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other /// %V = load i32* %P2 /// to: /// %V1 = load i32* %Alloca -> will be mem2reg'd /// %V2 = load i32* %Other /// %V = select i1 %cond, i32 %V1, i32 %V2 /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. static bool isSafeSelectToSpeculate(SelectInst *SI) { for (User *U : SI->users()) { LoadInst *LI = dyn_cast(U); if (!LI || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. if (!isSafeToLoadUnconditionally(SI->getTrueValue(), LI->getAlignment(), LI)) return false; if (!isSafeToLoadUnconditionally(SI->getFalseValue(), LI->getAlignment(), LI)) return false; } return true; } /// isSafePHIToSpeculate - PHI instructions that use an alloca and are /// subsequently loaded can be rewritten to load both input pointers in the pred /// blocks and then PHI the results, allowing the load of the alloca to be /// promoted. /// From this: /// %P2 = phi [i32* %Alloca, i32* %Other] /// %V = load i32* %P2 /// to: /// %V1 = load i32* %Alloca -> will be mem2reg'd /// ... /// %V2 = load i32* %Other /// ... /// %V = phi [i32 %V1, i32 %V2] /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. 
static bool isSafePHIToSpeculate(PHINode *PN) { // For now, we can only do this promotion if the load is in the same block as // the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. // TODO: Allow stores. BasicBlock *BB = PN->getParent(); unsigned MaxAlign = 0; for (User *U : PN->users()) { LoadInst *LI = dyn_cast(U); if (!LI || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. if (LI->getParent() != BB) return false; // Ensure that there are no instructions between the PHI and the load that // could store. for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; MaxAlign = std::max(MaxAlign, LI->getAlignment()); } // Okay, we know that we have one or more loads in the same block as the PHI. // We can transform this if it is safe to push the loads into the predecessor // blocks. The only thing to watch out for is that we can't put a possibly // trapping load in the predecessor if it is a critical edge. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); Value *InVal = PN->getIncomingValue(i); // If the terminator of the predecessor has side-effects (an invoke), // there is no safe place to put a load in the predecessor. if (Pred->getTerminator()->mayHaveSideEffects()) return false; // If the value is produced by the terminator of the predecessor // (an invoke), there is no valid place to put a load in the predecessor. if (Pred->getTerminator() == InVal) return false; // If the predecessor has a single successor, then the edge isn't critical. if (Pred->getTerminator()->getNumSuccessors() == 1) continue; // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. if (isSafeToLoadUnconditionally(InVal, MaxAlign, Pred->getTerminator())) continue; return false; } return true; } /// tryToMakeAllocaBePromotable - This returns true if the alloca only has /// direct (non-volatile) loads and stores to it. If the alloca is close but /// not quite there, this will transform the code to allow promotion. As such, /// it is a non-pure predicate. static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout &DL) { SetVector, SmallPtrSet > InstsToRewrite; for (User *U : AI->users()) { if (LoadInst *LI = dyn_cast(U)) { if (!LI->isSimple()) return false; continue; } if (StoreInst *SI = dyn_cast(U)) { if (SI->getOperand(0) == AI || !SI->isSimple()) return false; // Don't allow a store OF the AI, only INTO the AI. continue; } if (SelectInst *SI = dyn_cast(U)) { // If the condition being selected on is a constant, fold the select, yes // this does (rarely) happen early on. if (ConstantInt *CI = dyn_cast(SI->getCondition())) { Value *Result = SI->getOperand(1+CI->isZero()); SI->replaceAllUsesWith(Result); SI->eraseFromParent(); // This is very rare and we just scrambled the use list of AI, start // over completely. return tryToMakeAllocaBePromotable(AI, DL); } // If it is safe to turn "load (select c, AI, ptr)" into a select of two // loads, then we can transform this by rewriting the select. if (!isSafeSelectToSpeculate(SI)) return false; InstsToRewrite.insert(SI); continue; } if (PHINode *PN = dyn_cast(U)) { if (PN->use_empty()) { // Dead PHIs can be stripped. 
InstsToRewrite.insert(PN); continue; } // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads // in the pred blocks, then we can transform this by rewriting the PHI. if (!isSafePHIToSpeculate(PN)) return false; InstsToRewrite.insert(PN); continue; } if (BitCastInst *BCI = dyn_cast(U)) { if (onlyUsedByLifetimeMarkers(BCI)) { InstsToRewrite.insert(BCI); continue; } } return false; } // If there are no instructions to rewrite, then all uses are load/stores and // we're done! if (InstsToRewrite.empty()) return true; // If we have instructions that need to be rewritten for this to be promotable // take care of it now. for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { if (BitCastInst *BCI = dyn_cast(InstsToRewrite[i])) { // This could only be a bitcast used by nothing but lifetime intrinsics. for (BitCastInst::user_iterator I = BCI->user_begin(), E = BCI->user_end(); I != E;) cast(*I++)->eraseFromParent(); BCI->eraseFromParent(); continue; } if (SelectInst *SI = dyn_cast(InstsToRewrite[i])) { // Selects in InstsToRewrite only have load uses. Rewrite each as two // loads with a new select. while (!SI->use_empty()) { LoadInst *LI = cast(SI->user_back()); IRBuilder<> Builder(LI); LoadInst *TrueLoad = Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t"); LoadInst *FalseLoad = Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f"); // Transfer alignment and AA info if present. TrueLoad->setAlignment(LI->getAlignment()); FalseLoad->setAlignment(LI->getAlignment()); AAMDNodes Tags; LI->getAAMetadata(Tags); if (Tags) { TrueLoad->setAAMetadata(Tags); FalseLoad->setAAMetadata(Tags); } Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad); V->takeName(LI); LI->replaceAllUsesWith(V); LI->eraseFromParent(); } // Now that all the loads are gone, the select is gone too. SI->eraseFromParent(); continue; } // Otherwise, we have a PHI node which allows us to push the loads into the // predecessors. PHINode *PN = cast(InstsToRewrite[i]); if (PN->use_empty()) { PN->eraseFromParent(); continue; } - Type *LoadTy = cast(PN->getType())->getElementType(); + Type *LoadTy = AI->getAllocatedType(); PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), PN->getName()+".ld", PN); // Get the AA tags and alignment to use from one of the loads. It doesn't // matter which one we get and if any differ, it doesn't matter. LoadInst *SomeLoad = cast(PN->user_back()); AAMDNodes AATags; SomeLoad->getAAMetadata(AATags); unsigned Align = SomeLoad->getAlignment(); // Rewrite all loads of the PN to use the new PHI. while (!PN->use_empty()) { LoadInst *LI = cast(PN->user_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } // Inject loads into all of the pred blocks. Keep track of which blocks we // insert them into in case we have multiple edges from the same block. DenseMap InsertedLoads; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); LoadInst *&Load = InsertedLoads[Pred]; if (!Load) { Load = new LoadInst(PN->getIncomingValue(i), PN->getName() + "." 
+ Pred->getName(), Pred->getTerminator()); Load->setAlignment(Align); if (AATags) Load->setAAMetadata(AATags); } NewPN->addIncoming(Load, Pred); } PN->eraseFromParent(); } ++NumAdjusted; return true; } bool SROA::performPromotion(Function &F) { std::vector Allocas; const DataLayout &DL = F.getParent()->getDataLayout(); DominatorTree *DT = nullptr; if (HasDomTree) DT = &getAnalysis().getDomTree(); AssumptionCache &AC = getAnalysis().getAssumptionCache(F); BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); bool Changed = false; SmallVector Insts; while (1) { Allocas.clear(); // Find allocas that are safe to promote, by looking at all instructions in // the entry node for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) if (AllocaInst *AI = dyn_cast(I)) // Is it an alloca? if (tryToMakeAllocaBePromotable(AI, DL)) Allocas.push_back(AI); if (Allocas.empty()) break; if (HasDomTree) PromoteMemToReg(Allocas, *DT, nullptr, &AC); else { SSAUpdater SSA; for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { AllocaInst *AI = Allocas[i]; // Build list of instructions to promote. for (User *U : AI->users()) Insts.push_back(cast(U)); AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts); Insts.clear(); } } NumPromoted += Allocas.size(); Changed = true; } return Changed; } /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for /// SROA. It must be a struct or array type with a small number of elements. bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { Type *T = AI->getAllocatedType(); // Do not promote any struct that has too many members. if (StructType *ST = dyn_cast(T)) return ST->getNumElements() <= StructMemberThreshold; // Do not promote any array that has too many elements. if (ArrayType *AT = dyn_cast(T)) return AT->getNumElements() <= ArrayElementThreshold; return false; } // performScalarRepl - This algorithm is a simple worklist driven algorithm, // which runs on all of the alloca instructions in the entry block, removing // them if they are only used by getelementptr instructions. // bool SROA::performScalarRepl(Function &F) { std::vector WorkList; const DataLayout &DL = F.getParent()->getDataLayout(); // Scan the entry basic block, adding allocas to the worklist. BasicBlock &BB = F.getEntryBlock(); for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) if (AllocaInst *A = dyn_cast(I)) WorkList.push_back(A); // Process the worklist bool Changed = false; while (!WorkList.empty()) { AllocaInst *AI = WorkList.back(); WorkList.pop_back(); // Handle dead allocas trivially. These can be formed by SROA'ing arrays // with unused elements. if (AI->use_empty()) { AI->eraseFromParent(); Changed = true; continue; } // If this alloca is impossible for us to promote, reject it early. if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized()) continue; // Check to see if we can perform the core SROA transformation. We cannot // transform the allocation instruction if it is an array allocation // (allocations OF arrays are ok though), and an allocation of a scalar // value cannot be decomposed at all. uint64_t AllocaSize = DL.getTypeAllocSize(AI->getAllocatedType()); // Do not promote [0 x %struct]. if (AllocaSize == 0) continue; // Do not promote any struct whose size is too big. 
if (AllocaSize > SRThreshold) continue; // If the alloca looks like a good candidate for scalar replacement, and if // all its users can be transformed, then split up the aggregate into its // separate elements. if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) { DoScalarReplacement(AI, WorkList); Changed = true; continue; } // If we can turn this aggregate value (potentially with casts) into a // simple scalar value that can be mem2reg'd into a register value. // IsNotTrivial tracks whether this is something that mem2reg could have // promoted itself. If so, we don't want to transform it needlessly. Note // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. if (AllocaInst *NewAI = ConvertToScalarInfo((unsigned)AllocaSize, DL, ScalarLoadThreshold) .TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; Changed = true; continue; } // Otherwise, couldn't process this alloca. } return Changed; } /// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl /// predicate, do SROA now. void SROA::DoScalarReplacement(AllocaInst *AI, std::vector &WorkList) { DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n'); SmallVector ElementAllocas; if (StructType *ST = dyn_cast(AI->getAllocatedType())) { ElementAllocas.reserve(ST->getNumContainedTypes()); for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { AllocaInst *NA = new AllocaInst(ST->getContainedType(i), nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); WorkList.push_back(NA); // Add to worklist for recursive processing } } else { ArrayType *AT = cast(AI->getAllocatedType()); ElementAllocas.reserve(AT->getNumElements()); Type *ElTy = AT->getElementType(); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { AllocaInst *NA = new AllocaInst(ElTy, nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); WorkList.push_back(NA); // Add to worklist for recursive processing } } // Now that we have created the new alloca instructions, rewrite all the // uses of the old alloca. RewriteForScalarRepl(AI, AI, 0, ElementAllocas); // Now erase any instructions that were made dead while rewriting the alloca. DeleteDeadInstructions(); AI->eraseFromParent(); ++NumReplaced; } /// DeleteDeadInstructions - Erase instructions on the DeadInstrs list, /// recursively including all their operands that become trivially dead. void SROA::DeleteDeadInstructions() { while (!DeadInsts.empty()) { Instruction *I = cast(DeadInsts.pop_back_val()); for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) if (Instruction *U = dyn_cast(*OI)) { // Zero out the operand and see if it becomes trivially dead. // (But, don't add allocas to the dead instruction list -- they are // already on the worklist and will be deleted separately.) *OI = nullptr; if (isInstructionTriviallyDead(U) && !isa(U)) DeadInsts.push_back(U); } I->eraseFromParent(); } } /// isSafeForScalarRepl - Check if instruction I is a safe use with regard to /// performing scalar replacement of alloca AI. The results are flagged in /// the Info parameter. Offset indicates the position within AI that is /// referenced by this instruction. 
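/// For example (an illustrative sketch, not from the original comment): with a
/// typical data layout, a plain load through
///   %p = getelementptr { i32, i32 }* %AI, i32 0, i32 1
/// is a safe use of %AI at Offset 4, whereas a volatile store through %p, or a
/// store of the pointer %AI itself, marks the alloca unsafe below.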
void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { const DataLayout &DL = I->getModule()->getDataLayout(); for (Use &U : I->uses()) { Instruction *User = cast(U.getUser()); if (BitCastInst *BC = dyn_cast(User)) { isSafeForScalarRepl(BC, Offset, Info); } else if (GetElementPtrInst *GEPI = dyn_cast(User)) { uint64_t GEPOffset = Offset; isSafeGEP(GEPI, GEPOffset, Info); if (!Info.isUnsafe) isSafeForScalarRepl(GEPI, GEPOffset, Info); } else if (MemIntrinsic *MI = dyn_cast(User)) { ConstantInt *Length = dyn_cast(MI->getLength()); if (!Length || Length->isNegative()) return MarkUnsafe(Info, User); isSafeMemAccess(Offset, Length->getZExtValue(), nullptr, U.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); } else if (LoadInst *LI = dyn_cast(User)) { if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast(User)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (IntrinsicInst *II = dyn_cast(User)) { if (II->getIntrinsicID() != Intrinsic::lifetime_start && II->getIntrinsicID() != Intrinsic::lifetime_end) return MarkUnsafe(Info, User); } else if (isa(User) || isa(User)) { isSafePHISelectUseForScalarRepl(User, Offset, Info); } else { return MarkUnsafe(Info, User); } if (Info.isUnsafe) return; } } /// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer /// derived from the alloca, we can often still split the alloca into elements. /// This is useful if we have a large alloca where one element is phi'd /// together somewhere: we can SRoA and promote all the other elements even if /// we end up not being able to promote this one. /// /// All we require is that the uses of the PHI do not index into other parts of /// the alloca. The most important use case for this is single load and stores /// that are PHI'd together, which can happen due to code sinking. void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { // If we've already checked this PHI, don't do it again. if (PHINode *PN = dyn_cast(I)) if (!Info.CheckedPHIs.insert(PN).second) return; const DataLayout &DL = I->getModule()->getDataLayout(); for (User *U : I->users()) { Instruction *UI = cast(U); if (BitCastInst *BC = dyn_cast(UI)) { isSafePHISelectUseForScalarRepl(BC, Offset, Info); } else if (GetElementPtrInst *GEPI = dyn_cast(UI)) { // Only allow "bitcast" GEPs for simplicity. We could generalize this, // but would have to prove that we're staying inside of an element being // promoted. 
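    // A "bitcast" GEP is one whose indices are all zero, e.g. (illustrative)
    //   %p = getelementptr [4 x i32]* %phi, i32 0, i32 0
    // which changes only the pointee type, not the address; anything with a
    // nonzero index is rejected here.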
if (!GEPI->hasAllZeroIndices()) return MarkUnsafe(Info, UI); isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); } else if (LoadInst *LI = dyn_cast(UI)) { if (!LI->isSimple()) return MarkUnsafe(Info, UI); Type *LIType = LI->getType(); isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast(UI)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, UI); Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, SI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (isa(UI) || isa(UI)) { isSafePHISelectUseForScalarRepl(UI, Offset, Info); } else { return MarkUnsafe(Info, UI); } if (Info.isUnsafe) return; } } /// isSafeGEP - Check if a GEP instruction can be handled for scalar /// replacement. It is safe when all the indices are constant, in-bounds /// references, and when the resulting offset corresponds to an element within /// the alloca type. The results are flagged in the Info parameter. Upon /// return, Offset is adjusted as specified by the GEP indices. void SROA::isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info) { gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI); if (GEPIt == E) return; bool NonConstant = false; unsigned NonConstantIdxSize = 0; // Walk through the GEP type indices, checking the types that this indexes // into. for (; GEPIt != E; ++GEPIt) { // Ignore struct elements, no extra checking needed for these. if ((*GEPIt)->isStructTy()) continue; ConstantInt *IdxVal = dyn_cast(GEPIt.getOperand()); if (!IdxVal) return MarkUnsafe(Info, GEPI); } // Compute the offset due to this GEP and check if the alloca has a // component element at that offset. SmallVector Indices(GEPI->op_begin() + 1, GEPI->op_end()); // If this GEP is non-constant then the last operand must have been a // dynamic index into a vector. Pop this now as it has no impact on the // constant part of the offset. if (NonConstant) Indices.pop_back(); const DataLayout &DL = GEPI->getModule()->getDataLayout(); Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize, DL)) MarkUnsafe(Info, GEPI); } /// isHomogeneousAggregate - Check if type T is a struct or array containing /// elements of the same type (which is always true for arrays). If so, /// return true with NumElts and EltTy set to the number of elements and the /// element type, respectively. static bool isHomogeneousAggregate(Type *T, unsigned &NumElts, Type *&EltTy) { if (ArrayType *AT = dyn_cast(T)) { NumElts = AT->getNumElements(); EltTy = (NumElts == 0 ? nullptr : AT->getElementType()); return true; } if (StructType *ST = dyn_cast(T)) { NumElts = ST->getNumContainedTypes(); EltTy = (NumElts == 0 ? nullptr : ST->getContainedType(0)); for (unsigned n = 1; n < NumElts; ++n) { if (ST->getContainedType(n) != EltTy) return false; } return true; } return false; } /// isCompatibleAggregate - Check if T1 and T2 are either the same type or are /// "homogeneous" aggregates with the same element type and number of elements. 
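/// For example (illustrative), { i32, i32 } and [2 x i32] are treated as
/// compatible, while { i32, float } and [2 x i32] are not.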
static bool isCompatibleAggregate(Type *T1, Type *T2) {
  if (T1 == T2)
    return true;

  unsigned NumElts1, NumElts2;
  Type *EltTy1, *EltTy2;
  if (isHomogeneousAggregate(T1, NumElts1, EltTy1) &&
      isHomogeneousAggregate(T2, NumElts2, EltTy2) &&
      NumElts1 == NumElts2 &&
      EltTy1 == EltTy2)
    return true;

  return false;
}

/// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI
/// alloca or has an offset and size that corresponds to a component element
/// within it.  The offset checked here may have been formed from a GEP with a
/// pointer bitcasted to a different type.
///
/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a
/// unit.  If false, it only allows accesses known to be in a single element.
void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
                           Type *MemOpType, bool isStore,
                           AllocaInfo &Info, Instruction *TheAccess,
                           bool AllowWholeAccess) {
  const DataLayout &DL = TheAccess->getModule()->getDataLayout();
  // Check if this is a load/store of the entire alloca.
  if (Offset == 0 && AllowWholeAccess &&
      MemSize == DL.getTypeAllocSize(Info.AI->getAllocatedType())) {
    // This can be safe for MemIntrinsics (where MemOpType is 0) and integer
    // loads/stores (which are essentially the same as the MemIntrinsics with
    // regard to copying padding between elements).  But, if an alloca is
    // flagged as both a source and destination of such operations, we'll need
    // to check later for padding between elements.
    if (!MemOpType || MemOpType->isIntegerTy()) {
      if (isStore)
        Info.isMemCpyDst = true;
      else
        Info.isMemCpySrc = true;
      return;
    }
    // This is also safe for references using a type that is compatible with
    // the type of the alloca, so that loads/stores can be rewritten using
    // insertvalue/extractvalue.
    if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) {
      Info.hasSubelementAccess = true;
      return;
    }
  }
  // Check if the offset/size correspond to a component within the alloca type.
  Type *T = Info.AI->getAllocatedType();
  if (TypeHasComponent(T, Offset, MemSize, DL)) {
    Info.hasSubelementAccess = true;
    return;
  }

  return MarkUnsafe(Info, TheAccess);
}

/// TypeHasComponent - Return true if T has a component type with the
/// specified offset and size.  If Size is zero, do not check the size.
bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
                            const DataLayout &DL) {
  Type *EltTy;
  uint64_t EltSize;
  if (StructType *ST = dyn_cast<StructType>(T)) {
    const StructLayout *Layout = DL.getStructLayout(ST);
    unsigned EltIdx = Layout->getElementContainingOffset(Offset);
    EltTy = ST->getContainedType(EltIdx);
    EltSize = DL.getTypeAllocSize(EltTy);
    Offset -= Layout->getElementOffset(EltIdx);
  } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
    EltTy = AT->getElementType();
    EltSize = DL.getTypeAllocSize(EltTy);
    if (Offset >= AT->getNumElements() * EltSize)
      return false;
    Offset %= EltSize;
  } else if (VectorType *VT = dyn_cast<VectorType>(T)) {
    EltTy = VT->getElementType();
    EltSize = DL.getTypeAllocSize(EltTy);
    if (Offset >= VT->getNumElements() * EltSize)
      return false;
    Offset %= EltSize;
  } else {
    return false;
  }
  if (Offset == 0 && (Size == 0 || EltSize == Size))
    return true;
  // Check if the component spans multiple elements.
  if (Offset + Size > EltSize)
    return false;
  return TypeHasComponent(EltTy, Offset, Size, DL);
}

/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite
/// the instruction I, which references it, to use the separate elements.
/// Offset indicates the position within AI that is referenced by this
/// instruction.
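/// For example (an illustrative sketch, not from the original comment): if
/// %A = alloca { i32, float } is split into %A.0 and %A.1, a use chain such as
///   %p = getelementptr { i32, float }* %A, i32 0, i32 1
///   %v = load float* %p
/// is rewritten so the load reads directly from the new element alloca %A.1.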
void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts) { const DataLayout &DL = I->getModule()->getDataLayout(); for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { Use &TheUse = *UI++; Instruction *User = cast(TheUse.getUser()); if (BitCastInst *BC = dyn_cast(User)) { RewriteBitCast(BC, AI, Offset, NewElts); continue; } if (GetElementPtrInst *GEPI = dyn_cast(User)) { RewriteGEP(GEPI, AI, Offset, NewElts); continue; } if (MemIntrinsic *MI = dyn_cast(User)) { ConstantInt *Length = dyn_cast(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); if (Offset == 0 && MemSize == DL.getTypeAllocSize(AI->getAllocatedType())) RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); // Otherwise the intrinsic can only touch a single element and the // address operand will be updated, so nothing else needs to be done. continue; } if (IntrinsicInst *II = dyn_cast(User)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end) { RewriteLifetimeIntrinsic(II, AI, Offset, NewElts); } continue; } if (LoadInst *LI = dyn_cast(User)) { Type *LIType = LI->getType(); if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { // Replace: // %res = load { i32, i32 }* %alloc // with: // %load.0 = load i32* %alloc.0 // %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0 // %load.1 = load i32* %alloc.1 // %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1 // (Also works for arrays instead of structs) Value *Insert = UndefValue::get(LIType); IRBuilder<> Builder(LI); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { Value *Load = Builder.CreateLoad(NewElts[i], "load"); Insert = Builder.CreateInsertValue(Insert, Load, i, "insert"); } LI->replaceAllUsesWith(Insert); DeadInsts.push_back(LI); } else if (LIType->isIntegerTy() && DL.getTypeAllocSize(LIType) == DL.getTypeAllocSize(AI->getAllocatedType())) { // If this is a load of the entire alloca to an integer, rewrite it. RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); } continue; } if (StoreInst *SI = dyn_cast(User)) { Value *Val = SI->getOperand(0); Type *SIType = Val->getType(); if (isCompatibleAggregate(SIType, AI->getAllocatedType())) { // Replace: // store { i32, i32 } %val, { i32, i32 }* %alloc // with: // %val.0 = extractvalue { i32, i32 } %val, 0 // store i32 %val.0, i32* %alloc.0 // %val.1 = extractvalue { i32, i32 } %val, 1 // store i32 %val.1, i32* %alloc.1 // (Also works for arrays instead of structs) IRBuilder<> Builder(SI); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName()); Builder.CreateStore(Extract, NewElts[i]); } DeadInsts.push_back(SI); } else if (SIType->isIntegerTy() && DL.getTypeAllocSize(SIType) == DL.getTypeAllocSize(AI->getAllocatedType())) { // If this is a store of the entire alloca from an integer, rewrite it. RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); } continue; } if (isa(User) || isa(User)) { // If we have a PHI user of the alloca itself (as opposed to a GEP or // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to // the new pointer. if (!isa(I)) continue; assert(Offset == 0 && NewElts[0] && "Direct alloca use should have a zero offset"); // If we have a use of the alloca, we know the derived uses will be // utilizing just the first element of the scalarized result. Insert a // bitcast of the first alloca before the user as required. 
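      // E.g. (illustrative) if %A = alloca { i32 } is split into %A.0 and %A
      // itself feeds a PHI, the PHI operand becomes
      //   bitcast i32* %A.0 to { i32 }*
      // so the existing user still sees the original pointer type.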
AllocaInst *NewAI = NewElts[0]; BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI); NewAI->moveBefore(BCI); TheUse = BCI; continue; } } } /// RewriteBitCast - Update a bitcast reference to the alloca being replaced /// and recursively continue updating all of its uses. void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts) { RewriteForScalarRepl(BC, AI, Offset, NewElts); if (BC->getOperand(0) != AI) return; // The bitcast references the original alloca. Replace its uses with // references to the alloca containing offset zero (which is normally at // index zero, but might not be in cases involving structs with elements // of size zero). Type *T = AI->getAllocatedType(); uint64_t EltOffset = 0; Type *IdxTy; uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, BC->getModule()->getDataLayout()); Instruction *Val = NewElts[Idx]; if (Val->getType() != BC->getDestTy()) { Val = new BitCastInst(Val, BC->getDestTy(), "", BC); Val->takeName(BC); } BC->replaceAllUsesWith(Val); DeadInsts.push_back(BC); } /// FindElementAndOffset - Return the index of the element containing Offset /// within the specified type, which must be either a struct or an array. /// Sets T to the type of the element and Offset to the offset within that /// element. IdxTy is set to the type of the index result to be used in a /// GEP instruction. uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy, const DataLayout &DL) { uint64_t Idx = 0; if (StructType *ST = dyn_cast(T)) { const StructLayout *Layout = DL.getStructLayout(ST); Idx = Layout->getElementContainingOffset(Offset); T = ST->getContainedType(Idx); Offset -= Layout->getElementOffset(Idx); IdxTy = Type::getInt32Ty(T->getContext()); return Idx; } else if (ArrayType *AT = dyn_cast(T)) { T = AT->getElementType(); uint64_t EltSize = DL.getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); return Idx; } VectorType *VT = cast(T); T = VT->getElementType(); uint64_t EltSize = DL.getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); return Idx; } /// RewriteGEP - Check if this GEP instruction moves the pointer across /// elements of the alloca that are being split apart, and if so, rewrite /// the GEP to be relative to the new element. void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts) { uint64_t OldOffset = Offset; const DataLayout &DL = GEPI->getModule()->getDataLayout(); SmallVector Indices(GEPI->op_begin() + 1, GEPI->op_end()); // If the GEP was dynamic then it must have been a dynamic vector lookup. // In this case, it must be the last GEP operand which is dynamic so keep that // aside until we've found the constant GEP offset then add it back in at the // end. Value* NonConstantIdx = nullptr; if (!GEPI->hasAllConstantIndices()) NonConstantIdx = Indices.pop_back_val(); Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); Type *T = AI->getAllocatedType(); Type *IdxTy; uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy, DL); if (GEPI->getOperand(0) == AI) OldIdx = ~0ULL; // Force the GEP to be rewritten. 
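  // If the GEP is based directly on the old alloca there is no old element for
  // it to stay within, so the sentinel above can never equal Idx computed
  // below and the GEP is always rewritten onto one of the new element allocas.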
T = AI->getAllocatedType(); uint64_t EltOffset = Offset; uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, DL); // If this GEP does not move the pointer across elements of the alloca // being split, then it does not needs to be rewritten. if (Idx == OldIdx) return; Type *i32Ty = Type::getInt32Ty(AI->getContext()); SmallVector NewArgs; NewArgs.push_back(Constant::getNullValue(i32Ty)); while (EltOffset != 0) { uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy, DL); NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx)); } if (NonConstantIdx) { Type* GepTy = T; // This GEP has a dynamic index. We need to add "i32 0" to index through // any structs or arrays in the original type until we get to the vector // to index. while (!isa(GepTy)) { NewArgs.push_back(Constant::getNullValue(i32Ty)); GepTy = cast(GepTy)->getTypeAtIndex(0U); } NewArgs.push_back(NonConstantIdx); } Instruction *Val = NewElts[Idx]; if (NewArgs.size() > 1) { Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); Val->takeName(GEPI); } if (Val->getType() != GEPI->getType()) Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI); GEPI->replaceAllUsesWith(Val); DeadInsts.push_back(GEPI); } /// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it /// to mark the lifetime of the scalarized memory. void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts) { ConstantInt *OldSize = cast(II->getArgOperand(0)); // Put matching lifetime markers on everything from Offset up to // Offset+OldSize. Type *AIType = AI->getAllocatedType(); const DataLayout &DL = II->getModule()->getDataLayout(); uint64_t NewOffset = Offset; Type *IdxTy; uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy, DL); IRBuilder<> Builder(II); uint64_t Size = OldSize->getLimitedValue(); if (NewOffset) { // Splice the first element and index 'NewOffset' bytes in. SROA will // split the alloca again later. unsigned AS = AI->getType()->getAddressSpace(); Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS)); V = Builder.CreateGEP(Builder.getInt8Ty(), V, Builder.getInt64(NewOffset)); IdxTy = NewElts[Idx]->getAllocatedType(); uint64_t EltSize = DL.getTypeAllocSize(IdxTy) - NewOffset; if (EltSize > Size) { EltSize = Size; Size = 0; } else { Size -= EltSize; } if (II->getIntrinsicID() == Intrinsic::lifetime_start) Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize)); else Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize)); ++Idx; } for (; Idx != NewElts.size() && Size; ++Idx) { IdxTy = NewElts[Idx]->getAllocatedType(); uint64_t EltSize = DL.getTypeAllocSize(IdxTy); if (EltSize > Size) { EltSize = Size; Size = 0; } else { Size -= EltSize; } if (II->getIntrinsicID() == Intrinsic::lifetime_start) Builder.CreateLifetimeStart(NewElts[Idx], Builder.getInt64(EltSize)); else Builder.CreateLifetimeEnd(NewElts[Idx], Builder.getInt64(EltSize)); } DeadInsts.push_back(II); } /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, SmallVectorImpl &NewElts) { // If this is a memcpy/memmove, construct the other pointer as the // appropriate type. The "Other" pointer is the pointer that goes to memory // that doesn't have anything to do with the alloca that we are promoting. For // memset, this Value* stays null. 
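  // E.g. (illustrative) for a memcpy whose destination is a bitcast of the
  // alloca being split and whose source is some unrelated pointer %src,
  // OtherPtr becomes %src and each element of the aggregate is copied below
  // with its own per-element load/store or a smaller memcpy/memset.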
Value *OtherPtr = nullptr; unsigned MemAlignment = MI->getAlignment(); if (MemTransferInst *MTI = dyn_cast(MI)) { // memmove/memcopy if (Inst == MTI->getRawDest()) OtherPtr = MTI->getRawSource(); else { assert(Inst == MTI->getRawSource()); OtherPtr = MTI->getRawDest(); } } // If there is an other pointer, we want to convert it to the same pointer // type as AI has, so we can GEP through it safely. if (OtherPtr) { unsigned AddrSpace = cast(OtherPtr->getType())->getAddressSpace(); // Remove bitcasts and all-zero GEPs from OtherPtr. This is an // optimization, but it's also required to detect the corner case where // both pointer operands are referencing the same memory, and where // OtherPtr may be a bitcast or GEP that currently being rewritten. (This // function is only called for mem intrinsics that access the whole // aggregate, so non-zero GEPs are not an issue here.) OtherPtr = OtherPtr->stripPointerCasts(); // Copying the alloca to itself is a no-op: just delete it. if (OtherPtr == AI || OtherPtr == NewElts[0]) { // This code will run twice for a no-op memcpy -- once for each operand. // Put only one reference to MI on the DeadInsts list. for (SmallVectorImpl::const_iterator I = DeadInsts.begin(), E = DeadInsts.end(); I != E; ++I) if (*I == MI) return; DeadInsts.push_back(MI); return; } // If the pointer is not the right type, insert a bitcast to the right // type. - Type *NewTy = - PointerType::get(AI->getType()->getElementType(), AddrSpace); + Type *NewTy = PointerType::get(AI->getAllocatedType(), AddrSpace); if (OtherPtr->getType() != NewTy) OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI); } // Process each element of the aggregate. bool SROADest = MI->getRawDest() == Inst; Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext())); const DataLayout &DL = MI->getModule()->getDataLayout(); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // If this is a memcpy/memmove, emit a GEP of the other element address. Value *OtherElt = nullptr; unsigned OtherEltAlign = MemAlignment; if (OtherPtr) { Value *Idx[2] = { Zero, ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, OtherPtr->getName()+"."+Twine(i), MI); uint64_t EltOffset; - PointerType *OtherPtrTy = cast(OtherPtr->getType()); - Type *OtherTy = OtherPtrTy->getElementType(); + assert(AI->getType() == OtherPtr->getType()); + Type *OtherTy = AI->getAllocatedType(); if (StructType *ST = dyn_cast(OtherTy)) { EltOffset = DL.getStructLayout(ST)->getElementOffset(i); } else { Type *EltTy = cast(OtherTy)->getElementType(); EltOffset = DL.getTypeAllocSize(EltTy) * i; } // The alignment of the other pointer is the guaranteed alignment of the // element, which is affected by both the known alignment of the whole // mem intrinsic and the alignment of the element. If the alignment of // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the // known alignment is just 4 bytes. OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset); } - Value *EltPtr = NewElts[i]; - Type *EltTy = cast(EltPtr->getType())->getElementType(); + AllocaInst *EltPtr = NewElts[i]; + Type *EltTy = EltPtr->getAllocatedType(); // If we got down to a scalar, insert a load or store as appropriate. if (EltTy->isSingleValueType()) { if (isa(MI)) { if (SROADest) { // From Other to Alloca. Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI); new StoreInst(Elt, EltPtr, MI); } else { // From Alloca to Other. 
Value *Elt = new LoadInst(EltPtr, "tmp", MI); new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI); } continue; } assert(isa(MI)); // If the stored element is zero (common case), just store a null // constant. Constant *StoreVal; if (ConstantInt *CI = dyn_cast(MI->getArgOperand(1))) { if (CI->isZero()) { StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0> } else { // If EltTy is a vector type, get the element type. Type *ValTy = EltTy->getScalarType(); // Construct an integer with the right value. unsigned EltSize = DL.getTypeSizeInBits(ValTy); APInt OneVal(EltSize, CI->getZExtValue()); APInt TotalVal(OneVal); // Set each byte. for (unsigned i = 0; 8*i < EltSize; ++i) { TotalVal = TotalVal.shl(8); TotalVal |= OneVal; } // Convert the integer value to the appropriate type. StoreVal = ConstantInt::get(CI->getContext(), TotalVal); if (ValTy->isPointerTy()) StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy); else if (ValTy->isFloatingPointTy()) StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy); assert(StoreVal->getType() == ValTy && "Type mismatch!"); // If the requested value was a vector constant, create it. if (EltTy->isVectorTy()) { unsigned NumElts = cast(EltTy)->getNumElements(); StoreVal = ConstantVector::getSplat(NumElts, StoreVal); } } new StoreInst(StoreVal, EltPtr, MI); continue; } // Otherwise, if we're storing a byte variable, use a memset call for // this element. } unsigned EltSize = DL.getTypeAllocSize(EltTy); if (!EltSize) continue; IRBuilder<> Builder(MI); // Finally, insert the meminst for this element. if (isa(MI)) { Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize, MI->isVolatile()); } else { assert(isa(MI)); Value *Dst = SROADest ? EltPtr : OtherElt; // Dest ptr Value *Src = SROADest ? OtherElt : EltPtr; // Src ptr if (isa(MI)) Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile()); else Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile()); } } DeadInsts.push_back(MI); } /// RewriteStoreUserOfWholeAlloca - We found a store of an integer that /// overwrites the entire allocation. Extract out the pieces of the stored /// integer and store them individually. void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, SmallVectorImpl &NewElts) { // Extract each element out of the integer according to its structure offset // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); Type *AllocaEltTy = AI->getAllocatedType(); const DataLayout &DL = SI->getModule()->getDataLayout(); uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); // Handle tail padding by extending the operand if (DL.getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = Builder.CreateZExt(SrcVal, IntegerType::get(SI->getContext(), AllocaSizeBits)); DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI << '\n'); // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. if (StructType *EltSTy = dyn_cast(AllocaEltTy)) { const StructLayout *Layout = DL.getStructLayout(EltSTy); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Get the number of bits to shift SrcVal to get the value. 
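        // E.g. (illustrative, little-endian): storing an i64 %v into an alloca
        // of { i32, i32 } stores "trunc i64 %v to i32" into the first element
        // and "trunc (lshr i64 %v, 32) to i32" into the second.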
Type *FieldTy = EltSTy->getElementType(i); uint64_t Shift = Layout->getElementOffsetInBits(i); if (DL.isBigEndian()) Shift = AllocaSizeBits - Shift - DL.getTypeAllocSizeInBits(FieldTy); Value *EltVal = SrcVal; if (Shift) { Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); } // Truncate down to an integer of the right size. uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; if (FieldSizeBits != AllocaSizeBits) EltVal = Builder.CreateTrunc(EltVal, IntegerType::get(SI->getContext(), FieldSizeBits)); Value *DestField = NewElts[i]; if (EltVal->getType() == FieldTy) { // Storing to an integer field of this size, just do it. } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) { // Bitcast to the right element type (for fp/vector values). EltVal = Builder.CreateBitCast(EltVal, FieldTy); } else { // Otherwise, bitcast the dest pointer (for aggregates). DestField = Builder.CreateBitCast(DestField, PointerType::getUnqual(EltVal->getType())); } new StoreInst(EltVal, DestField, SI); } } else { ArrayType *ATy = cast(AllocaEltTy); Type *ArrayEltTy = ATy->getElementType(); uint64_t ElementOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); uint64_t ElementSizeBits = DL.getTypeSizeInBits(ArrayEltTy); uint64_t Shift; if (DL.isBigEndian()) Shift = AllocaSizeBits-ElementOffset; else Shift = 0; for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Ignore zero sized fields like {}, they obviously contain no data. if (ElementSizeBits == 0) continue; Value *EltVal = SrcVal; if (Shift) { Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); } // Truncate down to an integer of the right size. if (ElementSizeBits != AllocaSizeBits) EltVal = Builder.CreateTrunc(EltVal, IntegerType::get(SI->getContext(), ElementSizeBits)); Value *DestField = NewElts[i]; if (EltVal->getType() == ArrayEltTy) { // Storing to an integer field of this size, just do it. } else if (ArrayEltTy->isFloatingPointTy() || ArrayEltTy->isVectorTy()) { // Bitcast to the right element type (for fp/vector values). EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy); } else { // Otherwise, bitcast the dest pointer (for aggregates). DestField = Builder.CreateBitCast(DestField, PointerType::getUnqual(EltVal->getType())); } new StoreInst(EltVal, DestField, SI); if (DL.isBigEndian()) Shift -= ElementOffset; else Shift += ElementOffset; } } DeadInsts.push_back(SI); } /// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to /// an integer. Load the individual pieces to form the aggregate value. void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVectorImpl &NewElts) { // Extract each element out of the NewElts according to its structure offset // and form the result value. Type *AllocaEltTy = AI->getAllocatedType(); const DataLayout &DL = LI->getModule()->getDataLayout(); uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI << '\n'); // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. 
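  // For a struct the per-field bit offsets come from its StructLayout; for an
  // array, element i sits at i * (alloc size of the element type in bits).
  // E.g. (illustrative) the elements of [2 x i16] are at bit offsets 0 and 16.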
const StructLayout *Layout = nullptr; uint64_t ArrayEltBitOffset = 0; if (StructType *EltSTy = dyn_cast(AllocaEltTy)) { Layout = DL.getStructLayout(EltSTy); } else { Type *ArrayEltTy = cast(AllocaEltTy)->getElementType(); ArrayEltBitOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); } Value *ResultVal = Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits)); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Load the value from the alloca. If the NewElt is an aggregate, cast // the pointer to an integer of the same size before doing the load. Value *SrcField = NewElts[i]; - Type *FieldTy = - cast(SrcField->getType())->getElementType(); + Type *FieldTy = NewElts[i]->getAllocatedType(); uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; IntegerType *FieldIntTy = IntegerType::get(LI->getContext(), FieldSizeBits); if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() && !FieldTy->isVectorTy()) SrcField = new BitCastInst(SrcField, PointerType::getUnqual(FieldIntTy), "", LI); SrcField = new LoadInst(SrcField, "sroa.load.elt", LI); // If SrcField is a fp or vector of the right size but that isn't an // integer type, bitcast to an integer so we can shift it. if (SrcField->getType() != FieldIntTy) SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI); // Zero extend the field to be the same size as the final alloca so that // we can shift and insert it. if (SrcField->getType() != ResultVal->getType()) SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI); // Determine the number of bits to shift SrcField. uint64_t Shift; if (Layout) // Struct case. Shift = Layout->getElementOffsetInBits(i); else // Array case. Shift = i*ArrayEltBitOffset; if (DL.isBigEndian()) Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth(); if (Shift) { Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift); SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI); } // Don't create an 'or x, 0' on the first iteration. if (!isa(ResultVal) || !cast(ResultVal)->isNullValue()) ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI); else ResultVal = SrcField; } // Handle tail padding by truncating the result if (DL.getTypeSizeInBits(LI->getType()) != AllocaSizeBits) ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI); LI->replaceAllUsesWith(ResultVal); DeadInsts.push_back(LI); } /// HasPadding - Return true if the specified type has any structure or /// alignment padding in between the elements that would be split apart /// by SROA; return false otherwise. static bool HasPadding(Type *Ty, const DataLayout &DL) { if (ArrayType *ATy = dyn_cast(Ty)) { Ty = ATy->getElementType(); return DL.getTypeSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty); } // SROA currently handles only Arrays and Structs. StructType *STy = cast(Ty); const StructLayout *SL = DL.getStructLayout(STy); unsigned PrevFieldBitOffset = 0; for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { unsigned FieldBitOffset = SL->getElementOffsetInBits(i); // Check to see if there is any padding between this element and the // previous one. if (i) { unsigned PrevFieldEnd = PrevFieldBitOffset+DL.getTypeSizeInBits(STy->getElementType(i-1)); if (PrevFieldEnd < FieldBitOffset) return true; } PrevFieldBitOffset = FieldBitOffset; } // Check for tail padding. 
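  // E.g. (illustrative, typical data layout): { i32, i8 } occupies 8 bytes but
  // its last field ends at bit 40 < 64, so it has tail padding and HasPadding
  // returns true.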
  if (unsigned EltCount = STy->getNumElements()) {
    unsigned PrevFieldEnd = PrevFieldBitOffset +
      DL.getTypeSizeInBits(STy->getElementType(EltCount-1));
    if (PrevFieldEnd < SL->getSizeInBits())
      return true;
  }
  return false;
}

/// isSafeAllocaToScalarRepl - Check to see if the specified allocation of an
/// aggregate can be broken down into elements.  Return false if not, true if
/// it is safe to perform scalar replacement.
bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
  // Loop over the use list of the alloca.  We can only transform it if all of
  // the users are safe to transform.
  AllocaInfo Info(AI);

  isSafeForScalarRepl(AI, 0, Info);
  if (Info.isUnsafe) {
    DEBUG(dbgs() << "Cannot transform: " << *AI << '\n');
    return false;
  }

  const DataLayout &DL = AI->getModule()->getDataLayout();

  // Okay, we know all the users are promotable.  If the aggregate is a memcpy
  // source and destination, we have to be careful.  In particular, the memcpy
  // could be moving around elements that live in structure padding of the LLVM
  // types, but may actually be used.  In these cases, we refuse to promote the
  // struct.
  if (Info.isMemCpySrc && Info.isMemCpyDst &&
      HasPadding(AI->getAllocatedType(), DL))
    return false;

  // If the alloca never has an access to just *part* of it, but is accessed
  // via loads and stores, then we should use ConvertToScalarInfo to promote
  // the alloca instead of promoting each piece at a time and inserting fission
  // and fusion code.
  if (!Info.hasSubelementAccess && Info.hasALoadOrStore) {
    // If the struct/array just has one element, use basic SRoA.
    if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
      if (ST->getNumElements() > 1) return false;
    } else {
      if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1)
        return false;
    }
  }

  return true;
}
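// A minimal end-to-end sketch of the transformation implemented in this file
// (illustrative IR, not from an actual test case): given
//   %A = alloca { i32, float }
// whose only uses are loads and stores through element GEPs,
// DoScalarReplacement creates
//   %A.0 = alloca i32
//   %A.1 = alloca float
// rewrites every use to address %A.0/%A.1 directly, deletes %A, and the new
// allocas are then typically turned into SSA registers by the promotion step
// (performPromotion) on a later iteration of the pass.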