diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index bad745a992d3..5b2113a19eef 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -1,287 +1,290 @@ //===- Loads.cpp - Local load analysis ------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines simple local analyses for load instructions. // //===----------------------------------------------------------------------===// #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" using namespace llvm; /// \brief Test if A and B will obviously have the same value. /// /// This includes recognizing that %t0 and %t1 will have the same /// value in code like this: /// \code /// %t0 = getelementptr \@a, 0, 3 /// store i32 0, i32* %t0 /// %t1 = getelementptr \@a, 0, 3 /// %t2 = load i32* %t1 /// \endcode /// static bool AreEquivalentAddressValues(const Value *A, const Value *B) { // Test if the values are trivially equivalent. if (A == B) return true; // Test if the values come from identical arithmetic instructions. // Use isIdenticalToWhenDefined instead of isIdenticalTo because // this function is only used when one address use dominates the // other, which means that they'll always either have the same // value or one of them will have an undefined value. if (isa(A) || isa(A) || isa(A) || isa(A)) if (const Instruction *BI = dyn_cast(B)) if (cast(A)->isIdenticalToWhenDefined(BI)) return true; // Otherwise they may not be equivalent. 
return false; } /// \brief Check if executing a load of this pointer value cannot trap. /// /// If it is not obviously safe to load from the specified pointer, we do /// a quick local scan of the basic block containing \c ScanFrom, to determine /// if the address is already accessed. /// /// This uses the pointee type to determine how many bytes need to be safe to /// load from the pointer. bool llvm::isSafeToLoadUnconditionally(Value *V, unsigned Align, Instruction *ScanFrom) { const DataLayout &DL = ScanFrom->getModule()->getDataLayout(); // Zero alignment means that the load has the ABI alignment for the target if (Align == 0) Align = DL.getABITypeAlignment(V->getType()->getPointerElementType()); assert(isPowerOf2_32(Align)); + if (isDereferenceableAndAlignedPointer(V, Align, DL)) + return true; + int64_t ByteOffset = 0; Value *Base = V; Base = GetPointerBaseWithConstantOffset(V, ByteOffset, DL); if (ByteOffset < 0) // out of bounds return false; Type *BaseType = nullptr; unsigned BaseAlign = 0; if (const AllocaInst *AI = dyn_cast(Base)) { // An alloca is safe to load from as long as it is suitably aligned. BaseType = AI->getAllocatedType(); BaseAlign = AI->getAlignment(); } else if (const GlobalVariable *GV = dyn_cast(Base)) { // Global variables are not necessarily safe to load from if they are // overridden. Their size may change or they may be weak and require a test // to determine if they were in fact provided. if (!GV->mayBeOverridden()) { BaseType = GV->getType()->getElementType(); BaseAlign = GV->getAlignment(); } } PointerType *AddrTy = cast(V->getType()); uint64_t LoadSize = DL.getTypeStoreSize(AddrTy->getElementType()); // If we found a base allocated type from either an alloca or global variable, // try to see if we are definitively within the allocated region. We need to // know the size of the base type and the loaded type to do anything in this // case. 
if (BaseType && BaseType->isSized()) { if (BaseAlign == 0) BaseAlign = DL.getPrefTypeAlignment(BaseType); if (Align <= BaseAlign) { // Check if the load is within the bounds of the underlying object. if (ByteOffset + LoadSize <= DL.getTypeAllocSize(BaseType) && ((ByteOffset % Align) == 0)) return true; } } // Otherwise, be a little bit aggressive by scanning the local block where we // want to check to see if the pointer is already being loaded or stored // from/to. If so, the previous load or store would have already trapped, // so there is no harm doing an extra load (also, CSE will later eliminate // the load entirely). BasicBlock::iterator BBI = ScanFrom->getIterator(), E = ScanFrom->getParent()->begin(); // We can at least always strip pointer casts even though we can't use the // base here. V = V->stripPointerCasts(); while (BBI != E) { --BBI; // If we see a free or a call which may write to memory (i.e. which might do // a free) the pointer could be marked invalid. if (isa(BBI) && BBI->mayWriteToMemory() && !isa(BBI)) return false; Value *AccessedPtr; unsigned AccessedAlign; if (LoadInst *LI = dyn_cast(BBI)) { AccessedPtr = LI->getPointerOperand(); AccessedAlign = LI->getAlignment(); } else if (StoreInst *SI = dyn_cast(BBI)) { AccessedPtr = SI->getPointerOperand(); AccessedAlign = SI->getAlignment(); } else continue; Type *AccessedTy = AccessedPtr->getType()->getPointerElementType(); if (AccessedAlign == 0) AccessedAlign = DL.getABITypeAlignment(AccessedTy); if (AccessedAlign < Align) continue; // Handle trivial cases. if (AccessedPtr == V) return true; if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) && LoadSize <= DL.getTypeStoreSize(AccessedTy)) return true; } return false; } /// DefMaxInstsToScan - the default number of maximum instructions /// to scan in the block, used by FindAvailableLoadedValue(). /// FindAvailableLoadedValue() was introduced in r60148, to improve jump /// threading in part by eliminating partially redundant loads. 
/// At that point, the value of MaxInstsToScan was already set to '6' /// without documented explanation. cl::opt llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden, cl::desc("Use this to specify the default maximum number of instructions " "to scan backward from a given instruction, when searching for " "available loaded value")); /// \brief Scan the ScanBB block backwards to see if we have the value at the /// memory address *Ptr locally available within a small number of instructions. /// /// The scan starts from \c ScanFrom. \c MaxInstsToScan specifies the maximum /// instructions to scan in the block. If it is set to \c 0, it will scan the whole /// block. /// /// If the value is available, this function returns it. If not, it returns the /// iterator for the last validated instruction that the value would be live /// through. If we scanned the entire block and didn't find something that /// invalidates \c *Ptr or provides it, \c ScanFrom is left at the last /// instruction processed and this returns null. /// /// You can also optionally specify an alias analysis implementation, which /// makes this more precise. /// /// If \c AATags is non-null and a load or store is found, the AA tags from the /// load or store are recorded there. If there are no AA tags or if no access is /// found, it is left unmodified. Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan, AliasAnalysis *AA, AAMDNodes *AATags) { if (MaxInstsToScan == 0) MaxInstsToScan = ~0U; Type *AccessTy = cast(Ptr->getType())->getElementType(); const DataLayout &DL = ScanBB->getModule()->getDataLayout(); // Try to get the store size for the type. uint64_t AccessSize = DL.getTypeStoreSize(AccessTy); Value *StrippedPtr = Ptr->stripPointerCasts(); while (ScanFrom != ScanBB->begin()) { // We must ignore debug info directives when counting (otherwise they // would affect codegen). 
Instruction *Inst = &*--ScanFrom; if (isa(Inst)) continue; // Restore ScanFrom to expected value in case next test succeeds ScanFrom++; // Don't scan huge blocks. if (MaxInstsToScan-- == 0) return nullptr; --ScanFrom; // If this is a load of Ptr, the loaded value is available. // (This is true even if the load is volatile or atomic, although // those cases are unlikely.) if (LoadInst *LI = dyn_cast(Inst)) if (AreEquivalentAddressValues( LI->getPointerOperand()->stripPointerCasts(), StrippedPtr) && CastInst::isBitOrNoopPointerCastable(LI->getType(), AccessTy, DL)) { if (AATags) LI->getAAMetadata(*AATags); return LI; } if (StoreInst *SI = dyn_cast(Inst)) { Value *StorePtr = SI->getPointerOperand()->stripPointerCasts(); // If this is a store through Ptr, the value is available! // (This is true even if the store is volatile or atomic, although // those cases are unlikely.) if (AreEquivalentAddressValues(StorePtr, StrippedPtr) && CastInst::isBitOrNoopPointerCastable(SI->getValueOperand()->getType(), AccessTy, DL)) { if (AATags) SI->getAAMetadata(*AATags); return SI->getOperand(0); } // If both StrippedPtr and StorePtr reach all the way to an alloca or // global and they are different, ignore the store. This is a trivial form // of alias analysis that is important for reg2mem'd code. if ((isa(StrippedPtr) || isa(StrippedPtr)) && (isa(StorePtr) || isa(StorePtr)) && StrippedPtr != StorePtr) continue; // If we have alias analysis and it says the store won't modify the loaded // value, ignore the store. if (AA && (AA->getModRefInfo(SI, StrippedPtr, AccessSize) & MRI_Mod) == 0) continue; // Otherwise the store may or may not alias the pointer, so bail out. ++ScanFrom; return nullptr; } // If this is some other instruction that may clobber Ptr, bail out. if (Inst->mayWriteToMemory()) { // If alias analysis claims that it really won't modify the load, // ignore it. 
if (AA && (AA->getModRefInfo(Inst, StrippedPtr, AccessSize) & MRI_Mod) == 0) continue; // May modify the pointer, bail out. ++ScanFrom; return nullptr; } } // Got to the start of the block, we didn't find it, but are done for this // block. return nullptr; } diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index bf3ec9616f1a..dbbc1758b9fe 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1,4294 +1,4289 @@ //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// /// \file /// This transformation implements the well known scalar replacement of /// aggregates transformation. It tries to identify promotable elements of an /// aggregate alloca, and promote them to registers. It will also try to /// convert uses of an element (or set of elements) of an alloca into a vector /// or bitfield-style integer scalar if appropriate. /// /// It works to do this with minimal slicing of the alloca so that regions /// which are merely transferred in and out of external memory remain unchanged /// and are not decomposed to scalar code. /// /// Because this also performs alloca promotion, it can be thought of as also /// serving the purpose of SSA formation. The algorithm iterates on the /// function until all opportunities for promotion have been realized. 
/// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/SROA.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TimeValue.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #if __cplusplus >= 201103L && !defined(NDEBUG) // We only use this for a debug check in C++11 #include #endif using namespace llvm; using namespace llvm::sroa; #define DEBUG_TYPE "sroa" STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten"); STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition"); STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); STATISTIC(NumDeleted, 
"Number of instructions deleted"); STATISTIC(NumVectorized, "Number of vectorized aggregates"); /// Hidden option to enable randomly shuffling the slices to help uncover /// instability in their order. static cl::opt SROARandomShuffleSlices("sroa-random-shuffle-slices", cl::init(false), cl::Hidden); /// Hidden option to experiment with completely strict handling of inbounds /// GEPs. static cl::opt SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), cl::Hidden); namespace { /// \brief A custom IRBuilder inserter which prefixes all names if they are /// preserved. template class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter { std::string Prefix; public: void SetNamePrefix(const Twine &P) { Prefix = P.str(); } protected: void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, BasicBlock::iterator InsertPt) const { IRBuilderDefaultInserter::InsertHelper( I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt); } }; // Specialization for not preserving the name is trivial. template <> class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter { public: void SetNamePrefix(const Twine &P) {} }; /// \brief Provide a typedef for IRBuilder that drops names in release builds. #ifndef NDEBUG typedef llvm::IRBuilder> IRBuilderTy; #else typedef llvm::IRBuilder> IRBuilderTy; #endif } namespace { /// \brief A used slice of an alloca. /// /// This structure represents a slice of an alloca used by some instruction. It /// stores both the begin and end offsets of this use, a pointer to the use /// itself, and a flag indicating whether we can classify the use as splittable /// or not when forming partitions of the alloca. class Slice { /// \brief The beginning offset of the range. uint64_t BeginOffset; /// \brief The ending offset, not included in the range. uint64_t EndOffset; /// \brief Storage for both the use of this slice and whether it can be /// split. 
PointerIntPair UseAndIsSplittable; public: Slice() : BeginOffset(), EndOffset() {} Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable) : BeginOffset(BeginOffset), EndOffset(EndOffset), UseAndIsSplittable(U, IsSplittable) {} uint64_t beginOffset() const { return BeginOffset; } uint64_t endOffset() const { return EndOffset; } bool isSplittable() const { return UseAndIsSplittable.getInt(); } void makeUnsplittable() { UseAndIsSplittable.setInt(false); } Use *getUse() const { return UseAndIsSplittable.getPointer(); } bool isDead() const { return getUse() == nullptr; } void kill() { UseAndIsSplittable.setPointer(nullptr); } /// \brief Support for ordering ranges. /// /// This provides an ordering over ranges such that start offsets are /// always increasing, and within equal start offsets, the end offsets are /// decreasing. Thus the spanning range comes first in a cluster with the /// same start position. bool operator<(const Slice &RHS) const { if (beginOffset() < RHS.beginOffset()) return true; if (beginOffset() > RHS.beginOffset()) return false; if (isSplittable() != RHS.isSplittable()) return !isSplittable(); if (endOffset() > RHS.endOffset()) return true; return false; } /// \brief Support comparison with a single offset to allow binary searches. friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS, uint64_t RHSOffset) { return LHS.beginOffset() < RHSOffset; } friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, const Slice &RHS) { return LHSOffset < RHS.beginOffset(); } bool operator==(const Slice &RHS) const { return isSplittable() == RHS.isSplittable() && beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset(); } bool operator!=(const Slice &RHS) const { return !operator==(RHS); } }; } // end anonymous namespace namespace llvm { template struct isPodLike; template <> struct isPodLike { static const bool value = true; }; } /// \brief Representation of the alloca slices. 
/// /// This class represents the slices of an alloca which are formed by its /// various uses. If a pointer escapes, we can't fully build a representation /// for the slices used and we reflect that in this structure. The uses are /// stored, sorted by increasing beginning offset and with unsplittable slices /// starting at a particular offset before splittable slices. class llvm::sroa::AllocaSlices { public: /// \brief Construct the slices of a particular alloca. AllocaSlices(const DataLayout &DL, AllocaInst &AI); /// \brief Test whether a pointer to the allocation escapes our analysis. /// /// If this is true, the slices are never fully built and should be /// ignored. bool isEscaped() const { return PointerEscapingInstr; } /// \brief Support for iterating over the slices. /// @{ typedef SmallVectorImpl::iterator iterator; typedef iterator_range range; iterator begin() { return Slices.begin(); } iterator end() { return Slices.end(); } typedef SmallVectorImpl::const_iterator const_iterator; typedef iterator_range const_range; const_iterator begin() const { return Slices.begin(); } const_iterator end() const { return Slices.end(); } /// @} /// \brief Erase a range of slices. void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); } /// \brief Insert new slices for this alloca. /// /// This moves the slices into the alloca's slices collection, and re-sorts /// everything so that the usual ordering properties of the alloca's slices /// hold. void insert(ArrayRef NewSlices) { int OldSize = Slices.size(); Slices.append(NewSlices.begin(), NewSlices.end()); auto SliceI = Slices.begin() + OldSize; std::sort(SliceI, Slices.end()); std::inplace_merge(Slices.begin(), SliceI, Slices.end()); } // Forward declare the iterator and range accessor for walking the // partitions. class partition_iterator; iterator_range partitions(); /// \brief Access the dead users for this alloca. 
ArrayRef getDeadUsers() const { return DeadUsers; } /// \brief Access the dead operands referring to this alloca. /// /// These are operands which cannot actually be used to refer to the /// alloca as they are outside its range and the user doesn't correct for /// that. These mostly consist of PHI node inputs and the like which we just /// need to replace with undef. ArrayRef getDeadOperands() const { return DeadOperands; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; void printSlice(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; void printUse(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; void print(raw_ostream &OS) const; void dump(const_iterator I) const; void dump() const; #endif private: template class BuilderBase; class SliceBuilder; friend class AllocaSlices::SliceBuilder; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// \brief Handle to alloca instruction to simplify method interfaces. AllocaInst &AI; #endif /// \brief The instruction responsible for this alloca not having a known set /// of slices. /// /// When an instruction (potentially) escapes the pointer to the alloca, we /// store a pointer to that here and abort trying to form slices of the /// alloca. This will be null if the alloca slices are analyzed successfully. Instruction *PointerEscapingInstr; /// \brief The slices of the alloca. /// /// We store a vector of the slices formed by uses of the alloca here. This /// vector is sorted by increasing begin offset, and then the unsplittable /// slices before the splittable ones. See the Slice inner class for more /// details. SmallVector Slices; /// \brief Instructions which will become dead if we rewrite the alloca. /// /// Note that these are not separated by slice. This is because we expect an /// alloca to be completely rewritten or not rewritten at all. 
If rewritten, /// all these instructions can simply be removed and replaced with undef as /// they come from outside of the allocated space. SmallVector DeadUsers; /// \brief Operands which will become dead if we rewrite the alloca. /// /// These are operands that in their particular use can be replaced with /// undef when we rewrite the alloca. These show up in out-of-bounds inputs /// to PHI nodes and the like. They aren't entirely dead (there might be /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we /// want to swap this particular input for undef to simplify the use lists of /// the alloca. SmallVector DeadOperands; }; /// \brief A partition of the slices. /// /// An ephemeral representation for a range of slices which can be viewed as /// a partition of the alloca. This range represents a span of the alloca's /// memory which cannot be split, and provides access to all of the slices /// overlapping some part of the partition. /// /// Objects of this type are produced by traversing the alloca's slices, but /// are only ephemeral and not persistent. class llvm::sroa::Partition { private: friend class AllocaSlices; friend class AllocaSlices::partition_iterator; typedef AllocaSlices::iterator iterator; /// \brief The beginning and ending offsets of the alloca for this /// partition. uint64_t BeginOffset, EndOffset; /// \brief The start and end iterators of this partition. iterator SI, SJ; /// \brief A collection of split slice tails overlapping the partition. SmallVector SplitTails; /// \brief Raw constructor builds an empty partition starting and ending at /// the given iterator. Partition(iterator SI) : SI(SI), SJ(SI) {} public: /// \brief The start offset of this partition. /// /// All of the contained slices start at or after this offset. uint64_t beginOffset() const { return BeginOffset; } /// \brief The end offset of this partition. /// /// All of the contained slices end at or before this offset. 
uint64_t endOffset() const { return EndOffset; } /// \brief The size of the partition. /// /// Note that this can never be zero. uint64_t size() const { assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); return EndOffset - BeginOffset; } /// \brief Test whether this partition contains no slices, and merely spans /// a region occupied by split slices. bool empty() const { return SI == SJ; } /// \name Iterate slices that start within the partition. /// These may be splittable or unsplittable. They have a begin offset >= the /// partition begin offset. /// @{ // FIXME: We should probably define a "concat_iterator" helper and use that // to stitch together pointee_iterators over the split tails and the // contiguous iterators of the partition. That would give a much nicer // interface here. We could then additionally expose filtered iterators for // split, unsplit, and unsplittable slices based on the usage patterns. iterator begin() const { return SI; } iterator end() const { return SJ; } /// @} /// \brief Get the sequence of split slice tails. /// /// These tails are of slices which start before this partition but are /// split and overlap into the partition. We accumulate these while forming /// partitions. ArrayRef splitSliceTails() const { return SplitTails; } }; /// \brief An iterator over partitions of the alloca's slices. /// /// This iterator implements the core algorithm for partitioning the alloca's /// slices. It is a forward iterator as we don't support backtracking for /// efficiency reasons, and re-use a single storage area to maintain the /// current set of split slices. /// /// It is templated on the slice iterator type to use so that it can operate /// with either const or non-const slice iterators. class AllocaSlices::partition_iterator : public iterator_facade_base { friend class AllocaSlices; /// \brief Most of the state for walking the partitions is held in a class /// with a nice interface for examining them. 
Partition P; /// \brief We need to keep the end of the slices to know when to stop. AllocaSlices::iterator SE; /// \brief We also need to keep track of the maximum split end offset seen. /// FIXME: Do we really? uint64_t MaxSplitSliceEndOffset; /// \brief Sets the partition to be empty at given iterator, and sets the /// end iterator. partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { // If not already at the end, advance our state to form the initial // partition. if (SI != SE) advance(); } /// \brief Advance the iterator to the next partition. /// /// Requires that the iterator not be at the end of the slices. void advance() { assert((P.SI != SE || !P.SplitTails.empty()) && "Cannot advance past the end of the slices!"); // Clear out any split uses which have ended. if (!P.SplitTails.empty()) { if (P.EndOffset >= MaxSplitSliceEndOffset) { // If we've finished all splits, this is easy. P.SplitTails.clear(); MaxSplitSliceEndOffset = 0; } else { // Remove the uses which have ended in the prior partition. This // cannot change the max split slice end because we just checked that // the prior partition ended prior to that max. P.SplitTails.erase( std::remove_if( P.SplitTails.begin(), P.SplitTails.end(), [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), P.SplitTails.end()); assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), [&](Slice *S) { return S->endOffset() == MaxSplitSliceEndOffset; }) && "Could not find the current max split slice offset!"); assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), [&](Slice *S) { return S->endOffset() <= MaxSplitSliceEndOffset; }) && "Max split slice end offset is not actually the max!"); } } // If P.SI is already at the end, then we've cleared the split tail and // now have an end iterator. 
if (P.SI == SE) { assert(P.SplitTails.empty() && "Failed to clear the split slices!"); return; } // If we had a non-empty partition previously, set up the state for // subsequent partitions. if (P.SI != P.SJ) { // Accumulate all the splittable slices which started in the old // partition into the split list. for (Slice &S : P) if (S.isSplittable() && S.endOffset() > P.EndOffset) { P.SplitTails.push_back(&S); MaxSplitSliceEndOffset = std::max(S.endOffset(), MaxSplitSliceEndOffset); } // Start from the end of the previous partition. P.SI = P.SJ; // If P.SI is now at the end, we at most have a tail of split slices. if (P.SI == SE) { P.BeginOffset = P.EndOffset; P.EndOffset = MaxSplitSliceEndOffset; return; } // If we have split slices and the next slice is after a gap and is // not splittable, immediately form an empty partition for the split // slices up until the next slice begins. if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && !P.SI->isSplittable()) { P.BeginOffset = P.EndOffset; P.EndOffset = P.SI->beginOffset(); return; } } // OK, we need to consume new slices. Set the end offset based on the // current slice, and step SJ past it. The beginning offset of the // partition is the beginning offset of the next slice unless we have // pre-existing split slices that are continuing, in which case we begin // at the prior end offset. P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; P.EndOffset = P.SI->endOffset(); ++P.SJ; // There are two strategies to form a partition based on whether the // partition starts with an unsplittable slice or a splittable slice. if (!P.SI->isSplittable()) { // When we're forming an unsplittable region, it must always start at // the first slice and will extend through its end. assert(P.BeginOffset == P.SI->beginOffset()); // Form a partition including all of the overlapping slices with this // unsplittable slice. 
while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { if (!P.SJ->isSplittable()) P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); ++P.SJ; } // We have a partition across a set of overlapping unsplittable // partitions. return; } // If we're starting with a splittable slice, then we need to form // a synthetic partition spanning it and any other overlapping splittable // slices. assert(P.SI->isSplittable() && "Forming a splittable partition!"); // Collect all of the overlapping splittable slices. while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && P.SJ->isSplittable()) { P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); ++P.SJ; } // Back up P.EndOffset if we ended the span early when encountering an // unsplittable slice. This synthesizes the early end offset of // a partition spanning only splittable slices. if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { assert(!P.SJ->isSplittable()); P.EndOffset = P.SJ->beginOffset(); } } public: bool operator==(const partition_iterator &RHS) const { assert(SE == RHS.SE && "End iterators don't match between compared partition iterators!"); // The observed positions of partitions is marked by the P.SI iterator and // the emptiness of the split slices. The latter is only relevant when // P.SI == SE, as the end iterator will additionally have an empty split // slices list, but the prior may have the same P.SI and a tail of split // slices. if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) { assert(P.SJ == RHS.P.SJ && "Same set of slices formed two different sized partitions!"); assert(P.SplitTails.size() == RHS.P.SplitTails.size() && "Same slice position with differently sized non-empty split " "slice tails!"); return true; } return false; } partition_iterator &operator++() { advance(); return *this; } Partition &operator*() { return P; } }; /// \brief A forward range over the partitions of the alloca's slices. 
/// /// This returns an iterator range over the partitions of the alloca's /// slices. It computes these partitions on the fly based on the overlapping /// offsets of the slices and the ability to split them. It will visit "empty" /// partitions to cover regions of the alloca only accessed via split /// slices. iterator_range AllocaSlices::partitions() { return make_range(partition_iterator(begin(), end()), partition_iterator(end(), end())); } /// \brief Fold a select whose condition is a ConstantInt or whose two value /// operands are the same. /// /// Returns the operand the select always produces in those cases, or null when /// neither case applies. static Value *foldSelectInst(SelectInst &SI) { // If the condition being selected on is a constant or the same value is // being selected between, fold the select. Yes this does (rarely) happen // early on. if (ConstantInt *CI = dyn_cast(SI.getCondition())) return SI.getOperand(1 + CI->isZero()); if (SI.getOperand(1) == SI.getOperand(2)) return SI.getOperand(1); return nullptr; } /// \brief A helper that folds a PHI node or a select. /// /// For a PHI this forwards the result of PHINode::hasConstantValue(); any /// other instruction is cast to a select and handed to foldSelectInst. static Value *foldPHINodeOrSelectInst(Instruction &I) { if (PHINode *PN = dyn_cast(&I)) { // If PN merges together the same value, return that value. return PN->hasConstantValue(); } return foldSelectInst(cast(I)); } /// \brief Builder for the alloca slices. /// /// This class builds a set of alloca slices by recursively visiting the uses /// of an alloca and making a slice for each load and store at each offset. class AllocaSlices::SliceBuilder : public PtrUseVisitor { friend class PtrUseVisitor; friend class InstVisitor; typedef PtrUseVisitor Base; /// Allocation size of the alloca's allocated type, computed by the /// constructor from the DataLayout. const uint64_t AllocSize; /// The AllocaSlices object that receives the results of the walk (slices, /// dead users, dead operands). AllocaSlices &AS; /// For each memory transfer intrinsic already seen, the index in AS.Slices /// of the slice created on its first visit. SmallDenseMap MemTransferSliceMap; /// Access size recorded for each PHI or select already analyzed. SmallDenseMap PHIOrSelectSizes; /// \brief Set to de-duplicate dead instructions found in the use walk. 
SmallPtrSet VisitedDeadInsts; public: SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS) : PtrUseVisitor(DL), AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {} private: void markAsDead(Instruction &I) { if (VisitedDeadInsts.insert(&I).second) AS.DeadUsers.push_back(&I); } void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, bool IsSplittable = false) { // Completely skip uses which have a zero size or start either before or // past the end of the allocation. if (Size == 0 || Offset.uge(AllocSize)) { DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset << " which has zero size or starts outside of the " << AllocSize << " byte alloca:\n" << " alloca: " << AS.AI << "\n" << " use: " << I << "\n"); return markAsDead(I); } uint64_t BeginOffset = Offset.getZExtValue(); uint64_t EndOffset = BeginOffset + Size; // Clamp the end offset to the end of the allocation. Note that this is // formulated to handle even the case where "BeginOffset + Size" overflows. // This may appear superficially to be something we could ignore entirely, // but that is not so! There may be widened loads or PHI-node uses where // some instructions are dead but not others. We can't completely ignore // them, and so have to record at least the information here. assert(AllocSize >= BeginOffset); // Established above. 
if (Size > AllocSize - BeginOffset) { DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset << " to remain within the " << AllocSize << " byte alloca:\n" << " alloca: " << AS.AI << "\n" << " use: " << I << "\n"); EndOffset = AllocSize; } AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable)); } void visitBitCastInst(BitCastInst &BC) { if (BC.use_empty()) return markAsDead(BC); return Base::visitBitCastInst(BC); } void visitGetElementPtrInst(GetElementPtrInst &GEPI) { if (GEPI.use_empty()) return markAsDead(GEPI); if (SROAStrictInbounds && GEPI.isInBounds()) { // FIXME: This is a manually un-factored variant of the basic code inside // of GEPs with checking of the inbounds invariant specified in the // langref in a very strict sense. If we ever want to enable // SROAStrictInbounds, this code should be factored cleanly into // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds // by writing out the code here where we have the underlying allocation // size readily available. APInt GEPOffset = Offset; const DataLayout &DL = GEPI.getModule()->getDataLayout(); for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI); GTI != GTE; ++GTI) { // Only a leading run of constant indices is checked; the walk stops at // the first non-constant index. ConstantInt *OpC = dyn_cast(GTI.getOperand()); if (!OpC) break; // Handle a struct index, which adds its field offset to the pointer. if (StructType *STy = dyn_cast(*GTI)) { unsigned ElementIdx = OpC->getZExtValue(); const StructLayout *SL = DL.getStructLayout(STy); GEPOffset += APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx)); } else { // For array or vector indices, scale the index by the size of the // type. APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); GEPOffset += Index * APInt(Offset.getBitWidth(), DL.getTypeAllocSize(GTI.getIndexedType())); } // If this index has computed an intermediate pointer which is not // inbounds, then the result of the GEP is a poison value and we can // delete it and all uses. 
if (GEPOffset.ugt(AllocSize)) return markAsDead(GEPI); } } return Base::visitGetElementPtrInst(GEPI); } void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, uint64_t Size, bool IsVolatile) { // We allow splitting of non-volatile loads and stores where the type is an // integer type. These may be used to implement 'memcpy' or other "transfer // of bits" patterns. bool IsSplittable = Ty->isIntegerTy() && !IsVolatile; insertUse(I, Offset, Size, IsSplittable); } /// Record a slice for a load through the pointer, sized by the store size /// of the loaded type. Aborts the walk when the offset is not known. void visitLoadInst(LoadInst &LI) { assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && "All simple FCA loads should have been pre-split"); if (!IsOffsetKnown) return PI.setAborted(&LI); const DataLayout &DL = LI.getModule()->getDataLayout(); uint64_t Size = DL.getTypeStoreSize(LI.getType()); return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile()); } /// Record a slice for a store through the pointer. A store whose value /// operand is the pointer itself marks it escaped and aborts; an unknown /// offset aborts as well. void visitStoreInst(StoreInst &SI) { Value *ValOp = SI.getValueOperand(); if (ValOp == *U) return PI.setEscapedAndAborted(&SI); if (!IsOffsetKnown) return PI.setAborted(&SI); const DataLayout &DL = SI.getModule()->getDataLayout(); uint64_t Size = DL.getTypeStoreSize(ValOp->getType()); // If this memory access can be shown to *statically* extend outside the // bounds of the allocation, its behavior is undefined, so simply // ignore it. Note that this is more strict than the generic clamping // behavior of insertUse. We also try to handle cases which might run the // risk of overflow. // FIXME: We should instead consider the pointer to have escaped if this // function is being instrumented for addressing bugs or race conditions. 
if (Size > AllocSize || Offset.ugt(AllocSize - Size)) { DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset << " which extends past the end of the " << AllocSize << " byte alloca:\n" << " alloca: " << AS.AI << "\n" << " use: " << SI << "\n"); return markAsDead(SI); } assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && "All simple FCA stores should have been pre-split"); handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile()); } void visitMemSetInst(MemSetInst &II) { assert(II.getRawDest() == *U && "Pointer use is not the destination?"); ConstantInt *Length = dyn_cast(II.getLength()); if ((Length && Length->getValue() == 0) || (IsOffsetKnown && Offset.uge(AllocSize))) // Zero-length memsets, and memsets known to start at or past the end of // the allocation, can be ignored entirely. return markAsDead(II); if (!IsOffsetKnown) return PI.setAborted(&II); // A memset with a non-constant length is recorded as covering the rest of // the allocation and is not splittable. insertUse(II, Offset, Length ? Length->getLimitedValue() : AllocSize - Offset.getLimitedValue(), (bool)Length); } /// Record a slice for the side of a memory transfer intrinsic that uses the /// pointer currently being visited. void visitMemTransferInst(MemTransferInst &II) { ConstantInt *Length = dyn_cast(II.getLength()); if (Length && Length->getValue() == 0) // Zero-length mem transfer intrinsics can be ignored entirely. return markAsDead(II); // Because we can visit these intrinsics twice, also check to see if the // first time marked this instruction as dead. If so, skip it. if (VisitedDeadInsts.count(&II)) return; if (!IsOffsetKnown) return PI.setAborted(&II); // This side of the transfer is completely out-of-bounds, and so we can // nuke the entire transfer. However, we also need to nuke the other side // if already added to our partitions. // FIXME: Yet another place we really should bypass this when // instrumenting for ASan. if (Offset.uge(AllocSize)) { SmallDenseMap::iterator MTPI = MemTransferSliceMap.find(&II); if (MTPI != MemTransferSliceMap.end()) AS.Slices[MTPI->second].kill(); return markAsDead(II); } uint64_t RawOffset = Offset.getLimitedValue(); uint64_t Size = Length ? 
Length->getLimitedValue() : AllocSize - RawOffset; // Check for the special case where the same exact value is used for both // source and dest. if (*U == II.getRawDest() && *U == II.getRawSource()) { // For non-volatile transfers this is a no-op. if (!II.isVolatile()) return markAsDead(II); return insertUse(II, Offset, Size, /*IsSplittable=*/false); } // If we have seen both source and destination for a mem transfer, then // they both point to the same alloca. bool Inserted; SmallDenseMap::iterator MTPI; std::tie(MTPI, Inserted) = MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size())); unsigned PrevIdx = MTPI->second; if (!Inserted) { Slice &PrevP = AS.Slices[PrevIdx]; // Check if the begin offsets match and this is a non-volatile transfer. // In that case, we can completely elide the transfer. if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) { PrevP.kill(); return markAsDead(II); } // Otherwise we have an offset transfer within the same alloca. We can't // split those. PrevP.makeUnsplittable(); } // Insert the use now that we've fixed up the splittable nature. insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length); // Check that we ended up with a valid index in the map. assert(AS.Slices[PrevIdx].getUse()->getUser() == &II && "Map index doesn't point back to a slice with this user."); } // Disable SRoA for any intrinsics except for lifetime markers // (lifetime.start / lifetime.end). // FIXME: What about debug intrinsics? This matches old behavior, but // doesn't make sense. 
void visitIntrinsicInst(IntrinsicInst &II) { if (!IsOffsetKnown) return PI.setAborted(&II); if (II.getIntrinsicID() == Intrinsic::lifetime_start || II.getIntrinsicID() == Intrinsic::lifetime_end) { ConstantInt *Length = cast(II.getArgOperand(0)); uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), Length->getLimitedValue()); insertUse(II, Offset, Size, true); return; } Base::visitIntrinsicInst(II); } Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { // We consider any PHI or select that results in a direct load or store of // the same offset to be a viable use for slicing purposes. These uses // are considered unsplittable and the size is the maximum loaded or stored // size. // // Returns the first instruction found that makes the use unsafe, or null // when every transitive user is acceptable. Size is overwritten with the // largest load/store size seen (zero if there were none). SmallPtrSet Visited; SmallVector, 4> Uses; Visited.insert(Root); Uses.push_back(std::make_pair(cast(*U), Root)); const DataLayout &DL = Root->getModule()->getDataLayout(); // If there are no loads or stores, the access is dead. We mark that as // a size zero access. Size = 0; do { Instruction *I, *UsedI; std::tie(UsedI, I) = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast(I)) { Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); continue; } if (StoreInst *SI = dyn_cast(I)) { // Storing the tracked pointer as a value (rather than storing through // it) makes the use unsafe. Value *Op = SI->getOperand(0); if (Op == UsedI) return SI; Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); continue; } // A GEP is only looked through when all of its indices are zero. if (GetElementPtrInst *GEP = dyn_cast(I)) { if (!GEP->hasAllZeroIndices()) return GEP; } else if (!isa(I) && !isa(I) && !isa(I)) { return I; } for (User *U : I->users()) if (Visited.insert(cast(U)).second) Uses.push_back(std::make_pair(I, cast(U))); } while (!Uses.empty()); return nullptr; } /// Shared handling for PHI nodes and selects that use the pointer. void visitPHINodeOrSelectInst(Instruction &I) { assert(isa(I) || isa(I)); if (I.use_empty()) return markAsDead(I); // TODO: We could use SimplifyInstruction here to fold PHINodes and // SelectInsts. However, doing so requires changing the current // dead-operand-tracking mechanism. For instance, suppose neither loading // from %U nor %other traps. 
Then "load (select undef, %U, %other)" does not // trap either. However, if we simply replace %U with undef using the // current dead-operand-tracking mechanism, "load (select undef, undef, // %other)" may trap because the select may return the first operand // "undef". if (Value *Result = foldPHINodeOrSelectInst(I)) { if (Result == *U) // If the result of the constant fold will be the pointer, recurse // through the PHI/select as if we had RAUW'ed it. enqueueUsers(I); else // Otherwise the operand to the PHI/select is dead, and we can replace // it with undef. AS.DeadOperands.push_back(U); return; } if (!IsOffsetKnown) return PI.setAborted(&I); // See if we already have computed info on this node. uint64_t &Size = PHIOrSelectSizes[&I]; if (!Size) { // This is a new PHI/Select, check for an unsafe use of it. if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size)) return PI.setAborted(UnsafeI); } // For PHI and select operands outside the alloca, we can't nuke the entire // phi or select -- the other side might still be relevant, so we special // case them here and use a separate structure to track the operands // themselves which should be replaced with undef. // FIXME: This should instead be escaped in the event we're instrumenting // for address sanitization. if (Offset.uge(AllocSize)) { AS.DeadOperands.push_back(U); return; } insertUse(I, Offset, Size); } void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); } void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); } /// \brief Disable SROA entirely if there are unhandled users of the alloca. void visitInstruction(Instruction &I) { PI.setAborted(&I); } }; AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) AI(AI), #endif PointerEscapingInstr(nullptr) { SliceBuilder PB(DL, AI, *this); SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { // FIXME: We should sink the escape vs. 
abort info into the caller nicely, // possibly by just storing the PtrInfo in the AllocaSlices. PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() : PtrI.getAbortingInst(); assert(PointerEscapingInstr && "Did not track a bad instruction"); return; } Slices.erase(std::remove_if(Slices.begin(), Slices.end(), [](const Slice &S) { return S.isDead(); }), Slices.end()); #if __cplusplus >= 201103L && !defined(NDEBUG) if (SROARandomShuffleSlices) { std::mt19937 MT(static_cast(sys::TimeValue::now().msec())); std::shuffle(Slices.begin(), Slices.end(), MT); } #endif // Sort the uses. This arranges for the offsets to be in ascending order, // and the sizes to be in descending order. std::sort(Slices.begin(), Slices.end()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void AllocaSlices::print(raw_ostream &OS, const_iterator I, StringRef Indent) const { printSlice(OS, I, Indent); OS << "\n"; printUse(OS, I, Indent); } void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, StringRef Indent) const { OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" << " slice #" << (I - begin()) << (I->isSplittable() ? 
" (splittable)" : ""); } void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, StringRef Indent) const { OS << Indent << " used by: " << *I->getUse()->getUser() << "\n"; } void AllocaSlices::print(raw_ostream &OS) const { if (PointerEscapingInstr) { OS << "Can't analyze slices for alloca: " << AI << "\n" << " A pointer to this alloca escaped by:\n" << " " << *PointerEscapingInstr << "\n"; return; } OS << "Slices of alloca: " << AI << "\n"; for (const_iterator I = begin(), E = end(); I != E; ++I) print(OS, I); } LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const { print(dbgs(), I); } LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Walk the range of a partitioning looking for a common type to cover this /// sequence of slices. static Type *findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, uint64_t EndOffset) { Type *Ty = nullptr; bool TyIsCommon = true; IntegerType *ITy = nullptr; // Note that we need to look at *every* alloca slice's Use to ensure we // always get consistent results regardless of the order of slices. for (AllocaSlices::const_iterator I = B; I != E; ++I) { Use *U = I->getUse(); if (isa(*U->getUser())) continue; if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) continue; Type *UserTy = nullptr; if (LoadInst *LI = dyn_cast(U->getUser())) { UserTy = LI->getType(); } else if (StoreInst *SI = dyn_cast(U->getUser())) { UserTy = SI->getValueOperand()->getType(); } if (IntegerType *UserITy = dyn_cast_or_null(UserTy)) { // If the type is larger than the partition, skip it. We only encounter // this for split integer operations where we want to use the type of the // entity causing the split. Also skip if the type is not a byte width // multiple. 
if (UserITy->getBitWidth() % 8 != 0 || UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) continue; // Track the largest bitwidth integer type used in this way in case there // is no common type. if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth()) ITy = UserITy; } // To avoid depending on the order of slices, Ty and TyIsCommon must not // depend on types skipped above. if (!UserTy || (Ty && Ty != UserTy)) TyIsCommon = false; // Give up on anything but an iN type. else Ty = UserTy; } // Prefer the one type shared by every considered user; otherwise fall back // to the widest integer type seen, which may be null. return TyIsCommon ? Ty : ITy; } /// PHI instructions that use an alloca and are subsequently loaded can be /// rewritten to load both input pointers in the pred blocks and then PHI the /// results, allowing the load of the alloca to be promoted. /// From this: /// %P2 = phi [i32* %Alloca, i32* %Other] /// %V = load i32* %P2 /// to: /// %V1 = load i32* %Alloca -> will be mem2reg'd /// ... /// %V2 = load i32* %Other /// ... /// %V = phi [i32 %V1, i32 %V2] /// /// We can do this to a PHI if its only uses are loads and if the incoming /// values of the PHI can be loaded unconditionally. /// /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h static bool isSafePHIToSpeculate(PHINode &PN) { // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. // TODO: Allow stores. BasicBlock *BB = PN.getParent(); unsigned MaxAlign = 0; bool HaveLoad = false; for (User *U : PN.users()) { LoadInst *LI = dyn_cast(U); if (!LI || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is // a common case that happens when instcombine merges two loads through // a PHI. if (LI->getParent() != BB) return false; // Ensure that there are no instructions between the PHI and the load that // could store. 
for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; MaxAlign = std::max(MaxAlign, LI->getAlignment()); HaveLoad = true; } if (!HaveLoad) return false; const DataLayout &DL = PN.getModule()->getDataLayout(); // We can only transform this if it is safe to push the loads into the // predecessor blocks. The only thing to watch out for is that we can't put // a possibly trapping load in the predecessor if it is a critical edge. for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); Value *InVal = PN.getIncomingValue(Idx); // If the value is produced by the terminator of the predecessor (an // invoke) or it has side-effects, there is no valid place to put a load // in the predecessor. if (TI == InVal || TI->mayHaveSideEffects()) return false; // If the predecessor has a single successor, then the edge isn't // critical. if (TI->getNumSuccessors() == 1) continue; // If this pointer is always safe to load, or if we can prove that there // is already a load in the block, then we can move the load to the pred // block. - if (isDereferenceablePointer(InVal, DL) || - isSafeToLoadUnconditionally(InVal, MaxAlign, TI)) + if (isSafeToLoadUnconditionally(InVal, MaxAlign, TI)) continue; return false; } return true; } static void speculatePHINodeLoads(PHINode &PN) { DEBUG(dbgs() << " original: " << PN << "\n"); Type *LoadTy = cast(PN.getType())->getElementType(); IRBuilderTy PHIBuilder(&PN); PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), PN.getName() + ".sroa.speculated"); // Get the AA tags and alignment to use from one of the loads. It doesn't // matter which one we get and if any differ. LoadInst *SomeLoad = cast(PN.user_back()); AAMDNodes AATags; SomeLoad->getAAMetadata(AATags); unsigned Align = SomeLoad->getAlignment(); // Rewrite all loads of the PN to use the new PHI. 
while (!PN.use_empty()) { LoadInst *LI = cast(PN.user_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } // Inject loads into all of the pred blocks. for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { BasicBlock *Pred = PN.getIncomingBlock(Idx); TerminatorInst *TI = Pred->getTerminator(); Value *InVal = PN.getIncomingValue(Idx); IRBuilderTy PredBuilder(TI); LoadInst *Load = PredBuilder.CreateLoad( InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName())); ++NumLoadsSpeculated; Load->setAlignment(Align); if (AATags) Load->setAAMetadata(AATags); NewPN->addIncoming(Load, Pred); } DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); PN.eraseFromParent(); } /// Select instructions that use an alloca and are subsequently loaded can be /// rewritten to load both input pointers and then select between the results, /// allowing the load of the alloca to be promoted. /// From this: /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other /// %V = load i32* %P2 /// to: /// %V1 = load i32* %Alloca -> will be mem2reg'd /// %V2 = load i32* %Other /// %V = select i1 %cond, i32 %V1, i32 %V2 /// /// We can do this to a select if its only uses are loads and if both operands /// to the select can be loaded unconditionally. static bool isSafeSelectToSpeculate(SelectInst &SI) { Value *TValue = SI.getTrueValue(); Value *FValue = SI.getFalseValue(); const DataLayout &DL = SI.getModule()->getDataLayout(); - bool TDerefable = isDereferenceablePointer(TValue, DL); - bool FDerefable = isDereferenceablePointer(FValue, DL); for (User *U : SI.users()) { LoadInst *LI = dyn_cast(U); if (!LI || !LI->isSimple()) return false; // Both operands to the select need to be dereferenceable, either // absolutely (e.g. allocas) or at this point because we can see other // accesses to them. 
- if (!TDerefable && - !isSafeToLoadUnconditionally(TValue, LI->getAlignment(), LI)) + if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), LI)) return false; - if (!FDerefable && - !isSafeToLoadUnconditionally(FValue, LI->getAlignment(), LI)) + if (!isSafeToLoadUnconditionally(FValue, LI->getAlignment(), LI)) return false; } return true; } static void speculateSelectInstLoads(SelectInst &SI) { DEBUG(dbgs() << " original: " << SI << "\n"); IRBuilderTy IRB(&SI); Value *TV = SI.getTrueValue(); Value *FV = SI.getFalseValue(); // Replace the loads of the select with a select of two loads. while (!SI.use_empty()) { LoadInst *LI = cast(SI.user_back()); assert(LI->isSimple() && "We only speculate simple loads"); IRB.SetInsertPoint(LI); LoadInst *TL = IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); LoadInst *FL = IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); NumLoadsSpeculated += 2; // Transfer alignment and AA info if present. TL->setAlignment(LI->getAlignment()); FL->setAlignment(LI->getAlignment()); AAMDNodes Tags; LI->getAAMetadata(Tags); if (Tags) { TL->setAAMetadata(Tags); FL->setAAMetadata(Tags); } Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, LI->getName() + ".sroa.speculated"); DEBUG(dbgs() << " speculated to: " << *V << "\n"); LI->replaceAllUsesWith(V); LI->eraseFromParent(); } SI.eraseFromParent(); } /// \brief Build a GEP out of a base pointer and indices. /// /// This will return the BasePtr if that is valid, or build a new GEP /// instruction using the IRBuilder if GEP-ing is needed. static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, SmallVectorImpl &Indices, Twine NamePrefix) { if (Indices.empty()) return BasePtr; // A single zero index is a no-op, so check for this and avoid building a GEP // in that case. 
if (Indices.size() == 1 && cast(Indices.back())->isZero()) return BasePtr; return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices, NamePrefix + "sroa_idx"); } /// \brief Get a natural GEP off of the BasePtr walking through Ty toward /// TargetTy without changing the offset of the pointer. /// /// This routine assumes we've already established a properly offset GEP with /// Indices, and arrived at the Ty type. The goal is to continue to GEP with /// zero-indices down through type layers until we find one the same as /// TargetTy. If we can't find one with the same type, we at least try to use /// one with the same size. If none of that works, we just produce the GEP as /// indicated by Indices to have the correct offset. static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, Value *BasePtr, Type *Ty, Type *TargetTy, SmallVectorImpl &Indices, Twine NamePrefix) { if (Ty == TargetTy) return buildGEP(IRB, BasePtr, Indices, NamePrefix); // Pointer size to use for the indices. unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType()); // See if we can descend into a struct and locate a field with the correct // type. unsigned NumLayers = 0; Type *ElementTy = Ty; do { if (ElementTy->isPointerTy()) break; if (ArrayType *ArrayTy = dyn_cast(ElementTy)) { ElementTy = ArrayTy->getElementType(); Indices.push_back(IRB.getIntN(PtrSize, 0)); } else if (VectorType *VectorTy = dyn_cast(ElementTy)) { ElementTy = VectorTy->getElementType(); Indices.push_back(IRB.getInt32(0)); } else if (StructType *STy = dyn_cast(ElementTy)) { if (STy->element_begin() == STy->element_end()) break; // Nothing left to descend into. ElementTy = *STy->element_begin(); Indices.push_back(IRB.getInt32(0)); } else { break; } ++NumLayers; } while (ElementTy != TargetTy); if (ElementTy != TargetTy) Indices.erase(Indices.end() - NumLayers, Indices.end()); return buildGEP(IRB, BasePtr, Indices, NamePrefix); } /// \brief Recursively compute indices for a natural GEP. 
/// /// This is the recursive step for getNaturalGEPWithOffset that walks down the /// element types adding appropriate indices for the GEP. static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Type *Ty, APInt &Offset, Type *TargetTy, SmallVectorImpl &Indices, Twine NamePrefix) { if (Offset == 0) return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix); // We can't recurse through pointer types. if (Ty->isPointerTy()) return nullptr; // We try to analyze GEPs over vectors here, but note that these GEPs are // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast(Ty)) { unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8 != 0) { // GEPs over non-multiple of 8 size vector elements are invalid. return nullptr; } APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(VecTy->getNumElements())) return nullptr; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), Offset, TargetTy, Indices, NamePrefix); } if (ArrayType *ArrTy = dyn_cast(Ty)) { Type *ElementTy = ArrTy->getElementType(); APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) return nullptr; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices, NamePrefix); } StructType *STy = dyn_cast(Ty); if (!STy) return nullptr; const StructLayout *SL = DL.getStructLayout(STy); uint64_t StructOffset = Offset.getZExtValue(); if (StructOffset >= SL->getSizeInBytes()) return nullptr; 
unsigned Index = SL->getElementContainingOffset(StructOffset); Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); Type *ElementTy = STy->getElementType(Index); if (Offset.uge(DL.getTypeAllocSize(ElementTy))) return nullptr; // The offset points into alignment padding. Indices.push_back(IRB.getInt32(Index)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices, NamePrefix); } /// \brief Get a natural GEP from a base pointer to a particular offset and /// resulting in a particular type. /// /// The goal is to produce a "natural" looking GEP that works with the existing /// composite types to arrive at the appropriate offset and element type for /// a pointer. TargetTy is the element type the returned GEP should point-to if /// possible. We recurse by decreasing Offset, adding the appropriate index to /// Indices, and setting Ty to the result subtype. /// /// If no natural GEP can be constructed, this function returns null. static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *TargetTy, SmallVectorImpl &Indices, Twine NamePrefix) { PointerType *Ty = cast(Ptr->getType()); // Don't consider any GEPs through an i8* as natural unless the TargetTy is // an i8. if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8)) return nullptr; Type *ElementTy = Ty->getElementType(); if (!ElementTy->isSized()) return nullptr; // We can't GEP through an unsized element. APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); if (ElementSize == 0) return nullptr; // Zero-length arrays can't help us build a natural GEP. 
APInt NumSkippedElements = Offset.sdiv(ElementSize); Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices, NamePrefix); } /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the /// resulting pointer has PointerTy. /// /// This tries very hard to compute a "natural" GEP which arrives at the offset /// and produces the pointer type desired. Where it cannot, it will try to use /// the natural GEP to arrive at the offset and bitcast to the type. Where that /// fails, it will try to use an existing i8* and GEP to the byte offset and /// bitcast to the type. /// /// The strategy for finding the more natural GEPs is to peel off layers of the /// pointer, walking back through bit casts and GEPs, searching for a base /// pointer from which we can compute a natural GEP with the desired /// properties. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *PointerTy, Twine NamePrefix) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. SmallPtrSet Visited; Visited.insert(Ptr); SmallVector Indices; // We may end up computing an offset pointer that has the wrong type. If we // never are able to compute one directly that has the correct type, we'll // fall back to it, so keep it and the base it was computed from around here. Value *OffsetPtr = nullptr; Value *OffsetBasePtr; // Remember any i8 pointer we come across to re-use if we need to do a raw // byte offset. Value *Int8Ptr = nullptr; APInt Int8PtrOffset(Offset.getBitWidth(), 0); Type *TargetTy = PointerTy->getPointerElementType(); do { // First fold any existing GEPs into the offset. 
while (GEPOperator *GEP = dyn_cast(Ptr)) { APInt GEPOffset(Offset.getBitWidth(), 0); if (!GEP->accumulateConstantOffset(DL, GEPOffset)) break; Offset += GEPOffset; Ptr = GEP->getPointerOperand(); if (!Visited.insert(Ptr).second) break; } // See if we can perform a natural GEP here. Indices.clear(); if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, Indices, NamePrefix)) { // If we have a new natural pointer at the offset, clear out any old // offset pointer we computed. Unless it is the base pointer or // a non-instruction, we built a GEP we don't need. Zap it. if (OffsetPtr && OffsetPtr != OffsetBasePtr) if (Instruction *I = dyn_cast(OffsetPtr)) { assert(I->use_empty() && "Built a GEP with uses some how!"); I->eraseFromParent(); } OffsetPtr = P; OffsetBasePtr = Ptr; // If we also found a pointer of the right type, we're done. if (P->getType() == PointerTy) return P; } // Stash this pointer if we've found an i8*. if (Ptr->getType()->isIntegerTy(8)) { Int8Ptr = Ptr; Int8PtrOffset = Offset; } // Peel off a layer of the pointer and update the offset appropriately. if (Operator::getOpcode(Ptr) == Instruction::BitCast) { Ptr = cast(Ptr)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(Ptr)) { if (GA->mayBeOverridden()) break; Ptr = GA->getAliasee(); } else { break; } assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); } while (Visited.insert(Ptr).second); if (!OffsetPtr) { if (!Int8Ptr) { Int8Ptr = IRB.CreateBitCast( Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()), NamePrefix + "sroa_raw_cast"); Int8PtrOffset = Offset; } OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr, IRB.getInt(Int8PtrOffset), NamePrefix + "sroa_raw_idx"); } Ptr = OffsetPtr; // On the off chance we were targeting i8*, guard the bitcast here. 
if (Ptr->getType() != PointerTy) Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast"); return Ptr; } /// \brief Compute the adjusted alignment for a load or store from an offset. static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, const DataLayout &DL) { unsigned Alignment; Type *Ty; if (auto *LI = dyn_cast(I)) { Alignment = LI->getAlignment(); Ty = LI->getType(); } else if (auto *SI = dyn_cast(I)) { Alignment = SI->getAlignment(); Ty = SI->getValueOperand()->getType(); } else { llvm_unreachable("Only loads and stores are allowed!"); } if (!Alignment) Alignment = DL.getABITypeAlignment(Ty); return MinAlign(Alignment, Offset); } /// \brief Test whether we can convert a value from the old to the new type. /// /// This predicate should be used to guard calls to convertValue in order to /// ensure that we only try to convert viable values. The strategy is that we /// will peel off single element struct and array wrappings to get to an /// underlying value, and convert that value. static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { if (OldTy == NewTy) return true; // For integer types, we can't handle any bit-width differences. This would // break both vector conversions with extension and introduce endianness // issues when in conjunction with loads and stores. if (isa(OldTy) && isa(NewTy)) { assert(cast(OldTy)->getBitWidth() != cast(NewTy)->getBitWidth() && "We can't have the same bitwidth for different int types"); return false; } if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) return false; // We can convert pointers to integers and vice-versa. Same for vectors // of pointers and integers. 
OldTy = OldTy->getScalarType(); NewTy = NewTy->getScalarType(); if (NewTy->isPointerTy() || OldTy->isPointerTy()) { if (NewTy->isPointerTy() && OldTy->isPointerTy()) return true; if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) return true; return false; } return true; } /// \brief Generic routine to convert an SSA value to a value of a different /// type. /// /// This will try various different casting techniques, such as bitcasts, /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test /// two types for viability with this routine. static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, Type *NewTy) { Type *OldTy = V->getType(); assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); if (OldTy == NewTy) return V; assert(!(isa(OldTy) && isa(NewTy)) && "Integer types must be the exact same to convert."); // See if we need inttoptr for this type pair. A cast involving both scalars // and vectors requires and additional bitcast. if (OldTy->getScalarType()->isIntegerTy() && NewTy->getScalarType()->isPointerTy()) { // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* if (OldTy->isVectorTy() && !NewTy->isVectorTy()) return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), NewTy); // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> if (!OldTy->isVectorTy() && NewTy->isVectorTy()) return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), NewTy); return IRB.CreateIntToPtr(V, NewTy); } // See if we need ptrtoint for this type pair. A cast involving both scalars // and vectors requires and additional bitcast. 
if (OldTy->getScalarType()->isPointerTy() && NewTy->getScalarType()->isIntegerTy()) { // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 if (OldTy->isVectorTy() && !NewTy->isVectorTy()) return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), NewTy); // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> if (!OldTy->isVectorTy() && NewTy->isVectorTy()) return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), NewTy); return IRB.CreatePtrToInt(V, NewTy); } return IRB.CreateBitCast(V, NewTy); } /// \brief Test whether the given slice use can be promoted to a vector. /// /// This function is called to test each entry in a partition which is slated /// for a single slice. static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, VectorType *Ty, uint64_t ElementSize, const DataLayout &DL) { // First validate the slice offsets. uint64_t BeginOffset = std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); uint64_t BeginIndex = BeginOffset / ElementSize; if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= Ty->getNumElements()) return false; uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); uint64_t EndIndex = EndOffset / ElementSize; if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements()) return false; assert(EndIndex > BeginIndex && "Empty vector!"); uint64_t NumElements = EndIndex - BeginIndex; Type *SliceTy = (NumElements == 1) ? Ty->getElementType() : VectorType::get(Ty->getElementType(), NumElements); Type *SplitIntTy = Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8); Use *U = S.getUse(); if (MemIntrinsic *MI = dyn_cast(U->getUser())) { if (MI->isVolatile()) return false; if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. 
} else if (IntrinsicInst *II = dyn_cast(U->getUser())) { if (II->getIntrinsicID() != Intrinsic::lifetime_start && II->getIntrinsicID() != Intrinsic::lifetime_end) return false; } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { // Disable vector promotion when there are loads or stores of an FCA. return false; } else if (LoadInst *LI = dyn_cast(U->getUser())) { if (LI->isVolatile()) return false; Type *LTy = LI->getType(); if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(LTy->isIntegerTy()); LTy = SplitIntTy; } if (!canConvertValue(DL, SliceTy, LTy)) return false; } else if (StoreInst *SI = dyn_cast(U->getUser())) { if (SI->isVolatile()) return false; Type *STy = SI->getValueOperand()->getType(); if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { assert(STy->isIntegerTy()); STy = SplitIntTy; } if (!canConvertValue(DL, STy, SliceTy)) return false; } else { return false; } return true; } /// \brief Test whether the given alloca partitioning and range of slices can be /// promoted to a vector. /// /// This is a quick test to check whether we can rewrite a particular alloca /// partition (and its newly formed alloca) into a vector alloca with only /// whole-vector loads and stores such that it could be promoted to a vector /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. 
SmallVector CandidateTys; Type *CommonEltTy = nullptr; bool HaveCommonEltTy = true; auto CheckCandidateType = [&](Type *Ty) { if (auto *VTy = dyn_cast(Ty)) { CandidateTys.push_back(VTy); if (!CommonEltTy) CommonEltTy = VTy->getElementType(); else if (CommonEltTy != VTy->getElementType()) HaveCommonEltTy = false; } }; // Consider any loads or stores that are the exact size of the slice. for (const Slice &S : P) if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset()) { if (auto *LI = dyn_cast(S.getUse()->getUser())) CheckCandidateType(LI->getType()); else if (auto *SI = dyn_cast(S.getUse()->getUser())) CheckCandidateType(SI->getValueOperand()->getType()); } // If we didn't find a vector type, nothing to do here. if (CandidateTys.empty()) return nullptr; // Remove non-integer vector types if we had multiple common element types. // FIXME: It'd be nice to replace them with integer vector types, but we can't // do that until all the backends are known to produce good code for all // integer vector types. if (!HaveCommonEltTy) { CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(), [](VectorType *VTy) { return !VTy->getElementType()->isIntegerTy(); }), CandidateTys.end()); // If there were no integer vector types, give up. if (CandidateTys.empty()) return nullptr; // Rank the remaining candidate vector types. This is easy because we know // they're all integer vectors. We sort by ascending number of elements. 
auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) { assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) && "Cannot have vector types of different sizes!"); assert(RHSTy->getElementType()->isIntegerTy() && "All non-integer types eliminated!"); assert(LHSTy->getElementType()->isIntegerTy() && "All non-integer types eliminated!"); return RHSTy->getNumElements() < LHSTy->getNumElements(); }; std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes); CandidateTys.erase( std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes), CandidateTys.end()); } else { // The only way to have the same element type in every vector type is to // have the same vector type. Check that and remove all but one. #ifndef NDEBUG for (VectorType *VTy : CandidateTys) { assert(VTy->getElementType() == CommonEltTy && "Unaccounted for element type!"); assert(VTy == CandidateTys[0] && "Different vector types with the same element type!"); } #endif CandidateTys.resize(1); } // Try each vector type, and return the one which works. auto CheckVectorTypeForPromotion = [&](VectorType *VTy) { uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()); // While the definition of LLVM vectors is bitpacked, we don't support sizes // that aren't byte sized. if (ElementSize % 8) return false; assert((DL.getTypeSizeInBits(VTy) % 8) == 0 && "vector size not a multiple of element size?"); ElementSize /= 8; for (const Slice &S : P) if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) return false; for (const Slice *S : P.splitSliceTails()) if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) return false; return true; }; for (VectorType *VTy : CandidateTys) if (CheckVectorTypeForPromotion(VTy)) return VTy; return nullptr; } /// \brief Test whether a slice of an alloca is valid for integer widening. 
/// /// This implements the necessary checking for the \c isIntegerWideningViable /// test below on a single slice of the alloca. static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t AllocBeginOffset, Type *AllocaTy, const DataLayout &DL, bool &WholeAllocaOp) { uint64_t Size = DL.getTypeStoreSize(AllocaTy); uint64_t RelBegin = S.beginOffset() - AllocBeginOffset; uint64_t RelEnd = S.endOffset() - AllocBeginOffset; // We can't reasonably handle cases where the load or store extends past // the end of the alloca's type and into its padding. if (RelEnd > Size) return false; Use *U = S.getUse(); if (LoadInst *LI = dyn_cast(U->getUser())) { if (LI->isVolatile()) return false; // We can't handle loads that extend past the allocated memory. if (DL.getTypeStoreSize(LI->getType()) > Size) return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. if (!isa(LI->getType()) && RelBegin == 0 && RelEnd == Size) WholeAllocaOp = true; if (IntegerType *ITy = dyn_cast(LI->getType())) { if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) return false; } else if (RelBegin != 0 || RelEnd != Size || !canConvertValue(DL, AllocaTy, LI->getType())) { // Non-integer loads need to be convertible from the alloca type so that // they are promotable. return false; } } else if (StoreInst *SI = dyn_cast(U->getUser())) { Type *ValueTy = SI->getValueOperand()->getType(); if (SI->isVolatile()) return false; // We can't handle stores that extend past the allocated memory. if (DL.getTypeStoreSize(ValueTy) > Size) return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. 
if (!isa(ValueTy) && RelBegin == 0 && RelEnd == Size) WholeAllocaOp = true; if (IntegerType *ITy = dyn_cast(ValueTy)) { if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) return false; } else if (RelBegin != 0 || RelEnd != Size || !canConvertValue(DL, ValueTy, AllocaTy)) { // Non-integer stores need to be convertible to the alloca type so that // they are promotable. return false; } } else if (MemIntrinsic *MI = dyn_cast(U->getUser())) { if (MI->isVolatile() || !isa(MI->getLength())) return false; if (!S.isSplittable()) return false; // Skip any unsplittable intrinsics. } else if (IntrinsicInst *II = dyn_cast(U->getUser())) { if (II->getIntrinsicID() != Intrinsic::lifetime_start && II->getIntrinsicID() != Intrinsic::lifetime_end) return false; } else { return false; } return true; } /// \brief Test whether the given alloca partition's integer operations can be /// widened to promotable ones. /// /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, const DataLayout &DL) { uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. if (SizeInBits > IntegerType::MAX_INT_BITS) return false; // Don't try to handle allocas with bit-padding. if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy)) return false; // We need to ensure that an integer type with the appropriate bitwidth can // be converted to the alloca type, whatever that is. We don't want to force // the alloca itself to have an integer type if there is a more suitable one. Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); if (!canConvertValue(DL, AllocaTy, IntTy) || !canConvertValue(DL, IntTy, AllocaTy)) return false; // While examining uses, we ensure that the alloca has a covering load or // store. 
We don't want to widen the integer operations only to fail to // promote due to some other unsplittable entry (which we may make splittable // later). However, if there are only splittable uses, go ahead and assume // that we cover the alloca. // FIXME: We shouldn't consider split slices that happen to start in the // partition here... bool WholeAllocaOp = P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits); for (const Slice &S : P) if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL, WholeAllocaOp)) return false; for (const Slice *S : P.splitSliceTails()) if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, WholeAllocaOp)) return false; return WholeAllocaOp; } static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, IntegerType *Ty, uint64_t Offset, const Twine &Name) { DEBUG(dbgs() << " start: " << *V << "\n"); IntegerType *IntTy = cast(V->getType()); assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && "Element extends past full value"); uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); DEBUG(dbgs() << " shifted: " << *V << "\n"); } assert(Ty->getBitWidth() <= IntTy->getBitWidth() && "Cannot extract to a larger integer!"); if (Ty != IntTy) { V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); DEBUG(dbgs() << " trunced: " << *V << "\n"); } return V; } static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, Value *V, uint64_t Offset, const Twine &Name) { IntegerType *IntTy = cast(Old->getType()); IntegerType *Ty = cast(V->getType()); assert(Ty->getBitWidth() <= IntTy->getBitWidth() && "Cannot insert a larger integer!"); DEBUG(dbgs() << " start: " << *V << "\n"); if (Ty != IntTy) { V = IRB.CreateZExt(V, IntTy, Name + ".ext"); DEBUG(dbgs() << " extended: " << *V << "\n"); } assert(DL.getTypeStoreSize(Ty) + Offset <= 
DL.getTypeStoreSize(IntTy) && "Element store outside of alloca store"); uint64_t ShAmt = 8 * Offset; if (DL.isBigEndian()) ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); if (ShAmt) { V = IRB.CreateShl(V, ShAmt, Name + ".shift"); DEBUG(dbgs() << " shifted: " << *V << "\n"); } if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); DEBUG(dbgs() << " masked: " << *Old << "\n"); V = IRB.CreateOr(Old, V, Name + ".insert"); DEBUG(dbgs() << " inserted: " << *V << "\n"); } return V; } static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, unsigned EndIndex, const Twine &Name) { VectorType *VecTy = cast(V->getType()); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); if (NumElements == VecTy->getNumElements()) return V; if (NumElements == 1) { V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), Name + ".extract"); DEBUG(dbgs() << " extract: " << *V << "\n"); return V; } SmallVector Mask; Mask.reserve(NumElements); for (unsigned i = BeginIndex; i != EndIndex; ++i) Mask.push_back(IRB.getInt32(i)); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), ConstantVector::get(Mask), Name + ".extract"); DEBUG(dbgs() << " shuffle: " << *V << "\n"); return V; } static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, unsigned BeginIndex, const Twine &Name) { VectorType *VecTy = cast(Old->getType()); assert(VecTy && "Can only insert a vector into a vector"); VectorType *Ty = dyn_cast(V->getType()); if (!Ty) { // Single element to insert. 
V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), Name + ".insert"); DEBUG(dbgs() << " insert: " << *V << "\n"); return V; } assert(Ty->getNumElements() <= VecTy->getNumElements() && "Too many elements!"); if (Ty->getNumElements() == VecTy->getNumElements()) { assert(V->getType() == VecTy && "Vector type mismatch"); return V; } unsigned EndIndex = BeginIndex + Ty->getNumElements(); // When inserting a smaller vector into the larger to store, we first // use a shuffle vector to widen it with undef elements, and then // a second shuffle vector to select between the loaded vector and the // incoming vector. SmallVector Mask; Mask.reserve(VecTy->getNumElements()); for (unsigned i = 0; i != VecTy->getNumElements(); ++i) if (i >= BeginIndex && i < EndIndex) Mask.push_back(IRB.getInt32(i - BeginIndex)); else Mask.push_back(UndefValue::get(IRB.getInt32Ty())); V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), ConstantVector::get(Mask), Name + ".expand"); DEBUG(dbgs() << " shuffle: " << *V << "\n"); Mask.clear(); for (unsigned i = 0; i != VecTy->getNumElements(); ++i) Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex)); V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend"); DEBUG(dbgs() << " blend: " << *V << "\n"); return V; } /// \brief Visitor to rewrite instructions using p particular slice of an alloca /// to use a new alloca. /// /// Also implements the rewriting to vector-based accesses when the partition /// passes the isVectorPromotionViable predicate. Most of the rewriting logic /// lives here. class llvm::sroa::AllocaSliceRewriter : public InstVisitor { // Befriend the base class so it can delegate to private visit methods. 
friend class llvm::InstVisitor; typedef llvm::InstVisitor Base; const DataLayout &DL; AllocaSlices &AS; SROA &Pass; AllocaInst &OldAI, &NewAI; const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; Type *NewAllocaTy; // This is a convenience and flag variable that will be null unless the new // alloca's integer operations should be widened to this integer type due to // passing isIntegerWideningViable above. If it is non-null, the desired // integer type will be stored here for easy access during rewriting. IntegerType *IntTy; // If we are rewriting an alloca partition which can be written as pure // vector operations, we stash extra information here. When VecTy is // non-null, we have some strict guarantees about the rewritten alloca: // - The new alloca is exactly the size of the vector type here. // - The accesses all either map to the entire vector or to a single // element. // - The set of accessing instructions is only one of those handled above // in isVectorPromotionViable. Generally these are the same access kinds // which are promotable via mem2reg. VectorType *VecTy; Type *ElementTy; uint64_t ElementSize; // The original offset of the slice currently being rewritten relative to // the original alloca. uint64_t BeginOffset, EndOffset; // The new offsets of the slice currently being rewritten relative to the // original alloca. uint64_t NewBeginOffset, NewEndOffset; uint64_t SliceSize; bool IsSplittable; bool IsSplit; Use *OldUse; Instruction *OldPtr; // Track post-rewrite users which are PHI nodes and Selects. SmallPtrSetImpl &PHIUsers; SmallPtrSetImpl &SelectUsers; // Utility IR builder, whose name prefix is setup for each visited use, and // the insertion point is set to point to the user. 
IRBuilderTy IRB; public: AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, uint64_t NewAllocaBeginOffset, uint64_t NewAllocaEndOffset, bool IsIntegerPromotable, VectorType *PromotableVecTy, SmallPtrSetImpl &PHIUsers, SmallPtrSetImpl &SelectUsers) : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI), NewAllocaBeginOffset(NewAllocaBeginOffset), NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAI.getAllocatedType()), IntTy(IsIntegerPromotable ? Type::getIntNTy( NewAI.getContext(), DL.getTypeSizeInBits(NewAI.getAllocatedType())) : nullptr), VecTy(PromotableVecTy), ElementTy(VecTy ? VecTy->getElementType() : nullptr), ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), IRB(NewAI.getContext(), ConstantFolder()) { if (VecTy) { assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); ++NumVectorized; } assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy)); } bool visit(AllocaSlices::const_iterator I) { bool CanSROA = true; BeginOffset = I->beginOffset(); EndOffset = I->endOffset(); IsSplittable = I->isSplittable(); IsSplit = BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : "")); DEBUG(AS.printSlice(dbgs(), I, "")); DEBUG(dbgs() << "\n"); // Compute the intersecting offset range. 
assert(BeginOffset < NewAllocaEndOffset); assert(EndOffset > NewAllocaBeginOffset); NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); SliceSize = NewEndOffset - NewBeginOffset; OldUse = I->getUse(); OldPtr = cast(OldUse->get()); Instruction *OldUserI = cast(OldUse->getUser()); IRB.SetInsertPoint(OldUserI); IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); CanSROA &= visit(cast(OldUse->getUser())); if (VecTy || IntTy) assert(CanSROA); return CanSROA; } private: // Make sure the other visit overloads are visible. using Base::visit; // Every instruction which can end up as a user must have a rewrite rule. bool visitInstruction(Instruction &I) { DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); llvm_unreachable("No rewrite rule for this instruction!"); } Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) { // Note that the offset computation can use BeginOffset or NewBeginOffset // interchangeably for unsplit slices. assert(IsSplit || BeginOffset == NewBeginOffset); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; #ifndef NDEBUG StringRef OldName = OldPtr->getName(); // Skip through the last '.sroa.' component of the name. size_t LastSROAPrefix = OldName.rfind(".sroa."); if (LastSROAPrefix != StringRef::npos) { OldName = OldName.substr(LastSROAPrefix + strlen(".sroa.")); // Look for an SROA slice index. size_t IndexEnd = OldName.find_first_not_of("0123456789"); if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') { // Strip the index and look for the offset. OldName = OldName.substr(IndexEnd + 1); size_t OffsetEnd = OldName.find_first_not_of("0123456789"); if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.') // Strip the offset. OldName = OldName.substr(OffsetEnd + 1); } } // Strip any SROA suffixes as well. 
OldName = OldName.substr(0, OldName.find(".sroa_")); #endif return getAdjustedPtr(IRB, DL, &NewAI, APInt(DL.getPointerSizeInBits(), Offset), PointerTy, #ifndef NDEBUG Twine(OldName) + "." #else Twine() #endif ); } /// \brief Compute suitable alignment to access this slice of the *new* /// alloca. /// /// You can optionally pass a type to this routine and if that type's ABI /// alignment is itself suitable, this will return zero. unsigned getSliceAlign(Type *Ty = nullptr) { unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align; } unsigned getIndex(uint64_t Offset) { assert(VecTy && "Can only call getIndex when rewriting a vector"); uint64_t RelOffset = Offset - NewAllocaBeginOffset; assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); uint32_t Index = RelOffset / ElementSize; assert(Index * ElementSize == RelOffset); return Index; } void deleteIfTriviallyDead(Value *V) { Instruction *I = cast(V); if (isInstructionTriviallyDead(I)) Pass.DeadInsts.insert(I); } Value *rewriteVectorizedLoadInst() { unsigned BeginIndex = getIndex(NewBeginOffset); unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } Value *rewriteIntegerLoad(LoadInst &LI) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), 
SliceSize * 8); V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract"); } // It is possible that the extracted type is not the load type. This // happens if there is a load past the end of the alloca, and as // a consequence the slice is narrower but still a candidate for integer // lowering. To handle this case, we just zero extend the extracted // integer. assert(cast(LI.getType())->getBitWidth() >= SliceSize * 8 && "Can only handle an extract for an overly wide load"); if (cast(LI.getType())->getBitWidth() > SliceSize * 8) V = IRB.CreateZExt(V, LI.getType()); return V; } bool visitLoadInst(LoadInst &LI) { DEBUG(dbgs() << " original: " << LI << "\n"); Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize; bool IsPtrAdjusted = false; Value *V; if (VecTy) { V = rewriteVectorizedLoadInst(); } else if (IntTy && LI.getType()->isIntegerTy()) { V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && (canConvertValue(DL, NewAllocaTy, TargetTy) || (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy()))) { LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), LI.getName()); if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); V = NewLI; // If this is an integer load past the end of the slice (which means the // bytes outside the slice are undef or this load is dead) just forcibly // fix the integer size with correct handling of endianness. 
if (auto *AITy = dyn_cast(NewAllocaTy)) if (auto *TITy = dyn_cast(TargetTy)) if (AITy->getBitWidth() < TITy->getBitWidth()) { V = IRB.CreateZExt(V, TITy, "load.ext"); if (DL.isBigEndian()) V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(), "endian_shift"); } } else { Type *LTy = TargetTy->getPointerTo(); LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), getSliceAlign(TargetTy), LI.isVolatile(), LI.getName()); if (LI.isVolatile()) NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); V = NewLI; IsPtrAdjusted = true; } V = convertValue(DL, IRB, V, TargetTy); if (IsSplit) { assert(!LI.isVolatile()); assert(LI.getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(SliceSize < DL.getTypeStoreSize(LI.getType()) && "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); // Create a placeholder value with the same type as LI to use as the // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving // LI only used for this computation. 
Value *Placeholder = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); delete Placeholder; } else { LI.replaceAllUsesWith(V); } Pass.DeadInsts.insert(&LI); deleteIfTriviallyDead(OldOp); DEBUG(dbgs() << " to: " << *V << "\n"); return !LI.isVolatile() && !IsPtrAdjusted; } bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) { if (V->getType() != VecTy) { unsigned BeginIndex = getIndex(NewBeginOffset); unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); Type *SliceTy = (NumElements == 1) ? ElementTy : VectorType::get(ElementTy, NumElements); if (V->getType() != SliceTy) V = convertValue(DL, IRB, V, SliceTy); // Mix in the existing elements. Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); V = insertVector(IRB, Old, V, BeginIndex, "vec"); } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); return true; } bool rewriteIntegerStore(Value *V, StoreInst &SI) { assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); } V = convertValue(DL, IRB, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); (void)Store; DEBUG(dbgs() << " to: " << 
*Store << "\n"); return true; } bool visitStoreInst(StoreInst &SI) { DEBUG(dbgs() << " original: " << SI << "\n"); Value *OldOp = SI.getOperand(1); assert(OldOp == OldPtr); Value *V = SI.getValueOperand(); // Strip all inbounds GEPs and pointer casts to try to dig out any root // alloca that should be re-examined after promoting this alloca. if (V->getType()->isPointerTy()) if (AllocaInst *AI = dyn_cast(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); if (SliceSize < DL.getTypeStoreSize(V->getType())) { assert(!SI.isVolatile()); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset, "extract"); } if (VecTy) return rewriteVectorizedStoreInst(V, SI, OldOp); if (IntTy && V->getType()->isIntegerTy()) return rewriteIntegerStore(V, SI); const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize; StoreInst *NewSI; if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && (canConvertValue(DL, V->getType(), NewAllocaTy) || (IsStorePastEnd && NewAllocaTy->isIntegerTy() && V->getType()->isIntegerTy()))) { // If this is an integer store past the end of slice (and thus the bytes // past that point are irrelevant or this is unreachable), truncate the // value prior to storing. 
if (auto *VITy = dyn_cast(V->getType())) if (auto *AITy = dyn_cast(NewAllocaTy)) if (VITy->getBitWidth() > AITy->getBitWidth()) { if (DL.isBigEndian()) V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(), "endian_shift"); V = IRB.CreateTrunc(V, AITy, "load.trunc"); } V = convertValue(DL, IRB, V, NewAllocaTy); NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); } else { Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo()); NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()), SI.isVolatile()); } if (SI.isVolatile()) NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope()); Pass.DeadInsts.insert(&SI); deleteIfTriviallyDead(OldOp); DEBUG(dbgs() << " to: " << *NewSI << "\n"); return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); } /// \brief Compute an integer value from splatting an i8 across the given /// number of bytes. /// /// Note that this routine assumes an i8 is a byte. If that isn't true, don't /// call this routine. /// FIXME: Heed the advice above. /// /// \param V The i8 value to splat. /// \param Size The number of bytes in the output (assuming i8 is one byte) Value *getIntegerSplat(Value *V, unsigned Size) { assert(Size > 0 && "Expected a positive number of bytes."); IntegerType *VTy = cast(V->getType()); assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); if (Size == 1) return V; Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8); V = IRB.CreateMul( IRB.CreateZExt(V, SplatIntTy, "zext"), ConstantExpr::getUDiv( Constant::getAllOnesValue(SplatIntTy), ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()), SplatIntTy)), "isplat"); return V; } /// \brief Compute a vector splat for a given element value. 
Value *getVectorSplat(Value *V, unsigned NumElements) { V = IRB.CreateVectorSplat(NumElements, V, "vsplat"); DEBUG(dbgs() << " splat: " << *V << "\n"); return V; } bool visitMemSetInst(MemSetInst &II) { DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getRawDest() == OldPtr); // If the memset has a variable size, it cannot be split, just adjust the // pointer to the new alloca. if (!isa(II.getLength())) { assert(!IsSplit); assert(NewBeginOffset == BeginOffset); II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType())); Type *CstTy = II.getAlignmentCst()->getType(); II.setAlignment(ConstantInt::get(CstTy, getSliceAlign())); deleteIfTriviallyDead(OldPtr); return false; } // Record this instruction for deletion. Pass.DeadInsts.insert(&II); Type *AllocaTy = NewAI.getAllocatedType(); Type *ScalarTy = AllocaTy->getScalarType(); // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memset. if (!VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || SliceSize != DL.getTypeStoreSize(AllocaTy) || !AllocaTy->isSingleValueType() || !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) || DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) { Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemSet( getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, getSliceAlign(), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return false; } // If we can represent this as a simple value, we have to build the actual // value to store, which requires expanding the byte present in memset to // a sensible representation for the alloca type. This is essentially // splatting the byte to a sufficiently wide integer, splatting it across // any desired vector width, and bitcasting to the final type. Value *V; if (VecTy) { // If this is a memset of a vectorized alloca, insert it. 
assert(ElementTy == ScalarTy); unsigned BeginIndex = getIndex(NewBeginOffset); unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); Value *Splat = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8); Splat = convertValue(DL, IRB, Splat, ElementTy); if (NumElements > 1) Splat = getVectorSplat(Splat, NumElements); Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); V = insertVector(IRB, Old, Splat, BeginIndex, "vec"); } else if (IntTy) { // If this is a memset on an alloca where we can widen stores, insert the // set integer. assert(!II.isVolatile()); uint64_t Size = NewEndOffset - NewBeginOffset; V = getIntegerSplat(II.getValue(), Size); if (IntTy && (BeginOffset != NewAllocaBeginOffset || EndOffset != NewAllocaBeginOffset)) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; V = insertInteger(DL, IRB, Old, V, Offset, "insert"); } else { assert(V->getType() == IntTy && "Wrong type for an alloca wide integer!"); } V = convertValue(DL, IRB, V, AllocaTy); } else { // Established these invariants above. assert(NewBeginOffset == NewAllocaBeginOffset); assert(NewEndOffset == NewAllocaEndOffset); V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8); if (VectorType *AllocaVecTy = dyn_cast(AllocaTy)) V = getVectorSplat(V, AllocaVecTy->getNumElements()); V = convertValue(DL, IRB, V, AllocaTy); } Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return !II.isVolatile(); } bool visitMemTransferInst(MemTransferInst &II) { // Rewriting of memory transfer instructions can be a bit tricky. 
We break // them into two categories: split intrinsics and unsplit intrinsics. DEBUG(dbgs() << " original: " << II << "\n"); bool IsDest = &II.getRawDestUse() == OldUse; assert((IsDest && II.getRawDest() == OldPtr) || (!IsDest && II.getRawSource() == OldPtr)); unsigned SliceAlign = getSliceAlign(); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of // correctness. With unsplit intrinsics we may be dealing with transfers // within a single alloca before SROA ran, or with transfers that have // a variable length. We may also be dealing with memmove instead of // memcpy, and so simply updating the pointers is the necessary for us to // update both source and dest of a single call. if (!IsSplittable) { Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); if (IsDest) II.setDest(AdjustedPtr); else II.setSource(AdjustedPtr); if (II.getAlignment() > SliceAlign) { Type *CstTy = II.getAlignmentCst()->getType(); II.setAlignment( ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign))); } DEBUG(dbgs() << " to: " << II << "\n"); deleteIfTriviallyDead(OldPtr); return false; } // For split transfer intrinsics we have an incredibly useful assurance: // the source and destination do not reside within the same alloca, and at // least one of them does not escape. This means that we can replace // memmove with memcpy, and we don't need to worry about all manner of // downsides to splitting and transforming the operations. // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memcpy. 
bool EmitMemCpy = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) || !NewAI.getAllocatedType()->isSingleValueType()); // If we're just going to emit a memcpy, the alloca hasn't changed, and the // size hasn't been shrunk based on analysis of the viable range, this is // a no-op. if (EmitMemCpy && &OldAI == &NewAI) { // Ensure the start lines up. assert(NewBeginOffset == BeginOffset); // Rewrite the size as needed. if (NewEndOffset != EndOffset) II.setLength(ConstantInt::get(II.getLength()->getType(), NewEndOffset - NewBeginOffset)); return false; } // Record this instruction for deletion. Pass.DeadInsts.insert(&II); // Strip all inbounds GEPs and pointer casts to try to dig out any root // alloca that should be re-examined after rewriting this instruction. Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); if (AllocaInst *AI = dyn_cast(OtherPtr->stripInBoundsOffsets())) { assert(AI != &OldAI && AI != &NewAI && "Splittable transfers cannot reach the same alloca on both ends."); Pass.Worklist.insert(AI); } Type *OtherPtrTy = OtherPtr->getType(); unsigned OtherAS = OtherPtrTy->getPointerAddressSpace(); // Compute the relative offset for the other pointer within the transfer. unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS); APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset); unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1, OtherOffset.zextOrTrunc(64).getZExtValue()); if (EmitMemCpy) { // Compute the other pointer, folding as much as possible to produce // a single, simple GEP in most cases. OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, OtherPtr->getName() + "."); Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemCpy( IsDest ? 
OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size, MinAlign(SliceAlign, OtherAlign), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return false; } bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset; uint64_t Size = NewEndOffset - NewBeginOffset; unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; unsigned NumElements = EndIndex - BeginIndex; IntegerType *SubIntTy = IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr; // Reset the other pointer type to match the register type we're going to // use, but using the address space of the original other pointer. if (VecTy && !IsWholeAlloca) { if (NumElements == 1) OtherPtrTy = VecTy->getElementType(); else OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS); } else if (IntTy && !IsWholeAlloca) { OtherPtrTy = SubIntTy->getPointerTo(OtherAS); } else { OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS); } Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, OtherPtr->getName() + "."); unsigned SrcAlign = OtherAlign; Value *DstPtr = &NewAI; unsigned DstAlign = SliceAlign; if (!IsDest) { std::swap(SrcPtr, DstPtr); std::swap(SrcAlign, DstAlign); } Value *Src; if (VecTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec"); } else if (IntTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); Src = convertValue(DL, IRB, Src, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload"); } if (VecTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); 
Src = insertVector(IRB, Old, Src, BeginIndex, "vec"); } else if (IntTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); Old = convertValue(DL, IRB, Old, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); Src = convertValue(DL, IRB, Src, NewAllocaTy); } StoreInst *Store = cast( IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile())); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); return !II.isVolatile(); } bool visitIntrinsicInst(IntrinsicInst &II) { assert(II.getIntrinsicID() == Intrinsic::lifetime_start || II.getIntrinsicID() == Intrinsic::lifetime_end); DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getArgOperand(1) == OldPtr); // Record this instruction for deletion. Pass.DeadInsts.insert(&II); ConstantInt *Size = ConstantInt::get(cast(II.getArgOperand(0)->getType()), NewEndOffset - NewBeginOffset); Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); Value *New; if (II.getIntrinsicID() == Intrinsic::lifetime_start) New = IRB.CreateLifetimeStart(Ptr, Size); else New = IRB.CreateLifetimeEnd(Ptr, Size); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return true; } bool visitPHINode(PHINode &PN) { DEBUG(dbgs() << " original: " << PN << "\n"); assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable"); assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable"); // We would like to compute a new pointer in only one place, but have it be // as local as possible to the PHI. To do that, we re-use the location of // the old pointer, which necessarily must be in the right position to // dominate the PHI. 
IRBuilderTy PtrBuilder(IRB); if (isa(OldPtr)) PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt()); else PtrBuilder.SetInsertPoint(OldPtr); PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType()); // Replace the operands which were using the old pointer. std::replace(PN.op_begin(), PN.op_end(), cast(OldPtr), NewPtr); DEBUG(dbgs() << " to: " << PN << "\n"); deleteIfTriviallyDead(OldPtr); // PHIs can't be promoted on their own, but often can be speculated. We // check the speculation outside of the rewriter so that we see the // fully-rewritten alloca. PHIUsers.insert(&PN); return true; } bool visitSelectInst(SelectInst &SI) { DEBUG(dbgs() << " original: " << SI << "\n"); assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && "Pointer isn't an operand!"); assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); // Replace the operands which were using the old pointer. if (SI.getOperand(1) == OldPtr) SI.setOperand(1, NewPtr); if (SI.getOperand(2) == OldPtr) SI.setOperand(2, NewPtr); DEBUG(dbgs() << " to: " << SI << "\n"); deleteIfTriviallyDead(OldPtr); // Selects can't be promoted on their own, but often can be speculated. We // check the speculation outside of the rewriter so that we see the // fully-rewritten alloca. SelectUsers.insert(&SI); return true; } }; namespace { /// \brief Visitor to rewrite aggregate loads and stores as scalar. /// /// This pass aggressively rewrites all aggregate loads and stores on /// a particular pointer (or any pointer derived from it which we can identify) /// with scalar loads and stores. class AggLoadStoreRewriter : public InstVisitor { // Befriend the base class so it can delegate to private visit methods. 
friend class llvm::InstVisitor; /// Queue of pointer uses to analyze and potentially rewrite. SmallVector Queue; /// Set to prevent us from cycling with phi nodes and loops. SmallPtrSet Visited; /// The current pointer use being rewritten. This is used to dig up the used /// value (as opposed to the user). Use *U; public: /// Rewrite loads and stores through a pointer and all pointers derived from /// it. bool rewrite(Instruction &I) { DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); enqueueUsers(I); bool Changed = false; while (!Queue.empty()) { U = Queue.pop_back_val(); Changed |= visit(cast(U->getUser())); } return Changed; } private: /// Enqueue all the users of the given instruction for further processing. /// This uses a set to de-duplicate users. void enqueueUsers(Instruction &I) { for (Use &U : I.uses()) if (Visited.insert(U.getUser()).second) Queue.push_back(&U); } // Conservative default is to not rewrite anything. bool visitInstruction(Instruction &I) { return false; } /// \brief Generic recursive split emission class. template class OpSplitter { protected: /// The builder used to form new instructions. IRBuilderTy IRB; /// The indices which to be used with insert- or extractvalue to select the /// appropriate value within the aggregate. SmallVector Indices; /// The indices to a GEP instruction which will move Ptr to the correct slot /// within the aggregate. SmallVector GEPIndices; /// The base pointer of the original op, used as a base for GEPing the /// split operations. Value *Ptr; /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. OpSplitter(Instruction *InsertionPoint, Value *Ptr) : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} public: /// \brief Generic recursive split emission routine. /// /// This method recursively splits an aggregate op (load or store) into /// scalar or vector ops. 
It splits recursively until it hits a single value /// and emits that single value operation via the template argument. /// /// The logic of this routine relies on GEPs and insertvalue and /// extractvalue all operating with the same fundamental index list, merely /// formatted differently (GEPs need actual values). /// /// \param Ty The type being split recursively into smaller ops. /// \param Agg The aggregate value being built up or stored, depending on /// whether this is splitting a load or a store respectively. void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { if (Ty->isSingleValueType()) return static_cast(this)->emitFunc(Ty, Agg, Name); if (ArrayType *ATy = dyn_cast(Ty)) { unsigned OldSize = Indices.size(); (void)OldSize; for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; ++Idx) { assert(Indices.size() == OldSize && "Did not return to the old size"); Indices.push_back(Idx); GEPIndices.push_back(IRB.getInt32(Idx)); emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); GEPIndices.pop_back(); Indices.pop_back(); } return; } if (StructType *STy = dyn_cast(Ty)) { unsigned OldSize = Indices.size(); (void)OldSize; for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; ++Idx) { assert(Indices.size() == OldSize && "Did not return to the old size"); Indices.push_back(Idx); GEPIndices.push_back(IRB.getInt32(Idx)); emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); GEPIndices.pop_back(); Indices.pop_back(); } return; } llvm_unreachable("Only arrays and structs are aggregate loadable types"); } }; struct LoadOpSplitter : public OpSplitter { LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) : OpSplitter(InsertionPoint, Ptr) {} /// Emit a leaf load of a single value. This is called at the leaves of the /// recursive emission to actually load values. void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { assert(Ty->isSingleValueType()); // Load the single value and insert it using the indices. 
Value *GEP = IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"); Value *Load = IRB.CreateLoad(GEP, Name + ".load"); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); DEBUG(dbgs() << " to: " << *Load << "\n"); } }; bool visitLoadInst(LoadInst &LI) { assert(LI.getPointerOperand() == *U); if (!LI.isSimple() || LI.getType()->isSingleValueType()) return false; // We have an aggregate being loaded, split it apart. DEBUG(dbgs() << " original: " << LI << "\n"); LoadOpSplitter Splitter(&LI, *U); Value *V = UndefValue::get(LI.getType()); Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); LI.replaceAllUsesWith(V); LI.eraseFromParent(); return true; } struct StoreOpSplitter : public OpSplitter { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) : OpSplitter(InsertionPoint, Ptr) {} /// Emit a leaf store of a single value. This is called at the leaves of the /// recursive emission to actually produce stores. void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. Value *Store = IRB.CreateStore( IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep")); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); } }; bool visitStoreInst(StoreInst &SI) { if (!SI.isSimple() || SI.getPointerOperand() != *U) return false; Value *V = SI.getValueOperand(); if (V->getType()->isSingleValueType()) return false; // We have an aggregate being stored, split it apart. 
DEBUG(dbgs() << " original: " << SI << "\n"); StoreOpSplitter Splitter(&SI, *U); Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); SI.eraseFromParent(); return true; } bool visitBitCastInst(BitCastInst &BC) { enqueueUsers(BC); return false; } bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { enqueueUsers(GEPI); return false; } bool visitPHINode(PHINode &PN) { enqueueUsers(PN); return false; } bool visitSelectInst(SelectInst &SI) { enqueueUsers(SI); return false; } }; } /// \brief Strip aggregate type wrapping. /// /// This removes no-op aggregate types wrapping an underlying type. It will /// strip as many layers of types as it can without changing either the type /// size or the allocated size. static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { if (Ty->isSingleValueType()) return Ty; uint64_t AllocSize = DL.getTypeAllocSize(Ty); uint64_t TypeSize = DL.getTypeSizeInBits(Ty); Type *InnerTy; if (ArrayType *ArrTy = dyn_cast(Ty)) { InnerTy = ArrTy->getElementType(); } else if (StructType *STy = dyn_cast(Ty)) { const StructLayout *SL = DL.getStructLayout(STy); unsigned Index = SL->getElementContainingOffset(0); InnerTy = STy->getElementType(Index); } else { return Ty; } if (AllocSize > DL.getTypeAllocSize(InnerTy) || TypeSize > DL.getTypeSizeInBits(InnerTy)) return Ty; return stripAggregateTypeWrapping(DL, InnerTy); } /// \brief Try to find a partition of the aggregate type passed in for a given /// offset and size. /// /// This recurses through the aggregate type and tries to compute a subtype /// based on the offset and size. When the offset and size span a sub-section /// of an array, it will even compute a new array type for that sub-section, /// and the same for structs. /// /// Note that this routine is very strict and tries to find a partition of the /// type which produces the *exact* right offset and size. It is not forgiving /// when the size or offset cause either end of type-based partition to be off. 
/// Also, this is a best-effort routine. It is reasonable to give up and not
/// return a type if necessary.
///
/// \returns the partition type, or nullptr when no type with exactly the
/// requested offset and size can be formed.
static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
                              uint64_t Size) {
  // Whole-type request: just peel any no-op wrappers.
  if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
    return stripAggregateTypeWrapping(DL, Ty);
  // The requested range must lie inside the type's allocation.
  if (Offset > DL.getTypeAllocSize(Ty) ||
      (DL.getTypeAllocSize(Ty) - Offset) < Size)
    return nullptr;

  // NOTE(review): the dyn_cast calls in this function carry no template
  // argument in this copy of the file; presumably they name the type of the
  // pointer being declared -- confirm against the upstream source.
  if (SequentialType *SeqTy = dyn_cast(Ty)) {
    // We can't partition pointers...
    if (SeqTy->isPointerTy())
      return nullptr;

    Type *ElementTy = SeqTy->getElementType();
    uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
    uint64_t NumSkippedElements = Offset / ElementSize;
    if (ArrayType *ArrTy = dyn_cast(SeqTy)) {
      if (NumSkippedElements >= ArrTy->getNumElements())
        return nullptr;
    } else if (VectorType *VecTy = dyn_cast(SeqTy)) {
      if (NumSkippedElements >= VecTy->getNumElements())
        return nullptr;
    }
    // Make Offset relative to the first element the range touches.
    Offset -= NumSkippedElements * ElementSize;

    // First check if we need to recurse.
    if (Offset > 0 || Size < ElementSize) {
      // Bail if the partition ends in a different array element.
      if ((Offset + Size) > ElementSize)
        return nullptr;
      // Recurse through the element type trying to peel off offset bytes.
      return getTypePartition(DL, ElementTy, Offset, Size);
    }
    assert(Offset == 0);

    if (Size == ElementSize)
      return stripAggregateTypeWrapping(DL, ElementTy);
    assert(Size > ElementSize);
    // The range covers several elements; it must be a whole number of them.
    uint64_t NumElements = Size / ElementSize;
    if (NumElements * ElementSize != Size)
      return nullptr;
    return ArrayType::get(ElementTy, NumElements);
  }

  StructType *STy = dyn_cast(Ty);
  if (!STy)
    return nullptr;

  const StructLayout *SL = DL.getStructLayout(STy);
  if (Offset >= SL->getSizeInBytes())
    return nullptr;
  uint64_t EndOffset = Offset + Size;
  if (EndOffset > SL->getSizeInBytes())
    return nullptr;

  // Make Offset relative to the struct element that contains it.
  unsigned Index = SL->getElementContainingOffset(Offset);
  Offset -= SL->getElementOffset(Index);

  Type *ElementTy = STy->getElementType(Index);
  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
  if (Offset >= ElementSize)
    return nullptr; // The offset points into alignment padding.

  // See if any partition must be contained by the element.
  if (Offset > 0 || Size < ElementSize) {
    if ((Offset + Size) > ElementSize)
      return nullptr;
    return getTypePartition(DL, ElementTy, Offset, Size);
  }
  assert(Offset == 0);

  if (Size == ElementSize)
    return stripAggregateTypeWrapping(DL, ElementTy);

  // The range starts on an element boundary and spans more than one element:
  // find the element range [EI, EE) it covers.
  StructType::element_iterator EI = STy->element_begin() + Index,
                               EE = STy->element_end();
  if (EndOffset < SL->getSizeInBytes()) {
    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
    if (Index == EndIndex)
      return nullptr; // Within a single element and its padding.

    // Don't try to form "natural" types if the elements don't line up with the
    // expected size.
    // FIXME: We could potentially recurse down through the last element in the
    // sub-struct to find a natural end point.
    if (SL->getElementOffset(EndIndex) != EndOffset)
      return nullptr;

    assert(Index < EndIndex);
    EE = STy->element_begin() + EndIndex;
  }

  // Try to build up a sub-structure.
  StructType *SubTy =
      StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
  const StructLayout *SubSL = DL.getStructLayout(SubTy);
  if (Size != SubSL->getSizeInBytes())
    return nullptr; // The sub-struct doesn't have quite the size needed.

  return SubTy;
}

/// \brief Pre-split loads and stores to simplify rewriting.
///
/// We want to break up the splittable load+store pairs as much as
/// possible. This is important to do as a preprocessing step, as once we
/// start rewriting the accesses to partitions of the alloca we lose the
/// necessary information to correctly split apart paired loads and stores
/// which both point into this alloca. The case to consider is something like
/// the following:
///
///   %a = alloca [12 x i8]
///   %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
///   %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
///   %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
///   %iptr1 = bitcast i8* %gep1 to i64*
///   %iptr2 = bitcast i8* %gep2 to i64*
///   %fptr1 = bitcast i8* %gep1 to float*
///   %fptr2 = bitcast i8* %gep2 to float*
///   %fptr3 = bitcast i8* %gep3 to float*
///   store float 0.0, float* %fptr1
///   store float 1.0, float* %fptr2
///   %v = load i64* %iptr1
///   store i64 %v, i64* %iptr2
///   %f1 = load float* %fptr2
///   %f2 = load float* %fptr3
///
/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
/// promote everything so we recover the 2 SSA values that should have been
/// there all along.
///
/// \returns true if any changes are made.
bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
  DEBUG(dbgs() << "Pre-splitting loads and stores\n");

  // NOTE(review): container declarations and cast/dyn_cast calls in this
  // function carry no template arguments in this copy of the file (see e.g.
  // "SmallVector Loads;"); they look stripped by extraction -- restore them
  // from the upstream source before building.

  // Track the loads and stores which are candidates for pre-splitting here, in
  // the order they first appear during the partition scan. These give stable
  // iteration order and a basis for tracking which loads and stores we
  // actually split.
  SmallVector Loads;
  SmallVector Stores;

  // We need to accumulate the splits required of each load or store where we
  // can find them via a direct lookup. This is important to cross-check loads
  // and stores against each other. We also track the slice so that we can kill
  // all the slices that end up split.
  struct SplitOffsets {
    Slice *S;
    std::vector Splits;
  };
  SmallDenseMap SplitOffsetsMap;

  // Track loads out of this alloca which cannot, for any reason, be pre-split.
  // This is important as we also cannot pre-split stores of those loads!
  // FIXME: This is all pretty gross. It means that we can be more aggressive
  // in pre-splitting when the load feeding the store happens to come from
  // a separate alloca. Put another way, the effectiveness of SROA would be
  // decreased by a frontend which just concatenated all of its local allocas
  // into one big flat alloca. But defeating such patterns is exactly the job
  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
  // change store pre-splitting to actually force pre-splitting of the load
  // that feeds it *and all stores*. That makes pre-splitting much harder, but
  // maybe it would make it more principled?
  SmallPtrSet UnsplittableLoads;

  DEBUG(dbgs() << " Searching for candidate loads and stores\n");
  for (auto &P : AS.partitions()) {
    for (Slice &S : P) {
      Instruction *I = cast(S.getUse()->getUser());
      // Slices that are unsplittable, or that end within this partition, need
      // no pre-splitting.
      if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) {
        // If this was a load we have to track that it can't participate in any
        // pre-splitting!
        if (auto *LI = dyn_cast(I))
          UnsplittableLoads.insert(LI);
        continue;
      }
      assert(P.endOffset() > S.beginOffset() &&
             "Empty or backwards partition!");

      // Determine if this is a pre-splittable slice.
      if (auto *LI = dyn_cast(I)) {
        assert(!LI->isVolatile() && "Cannot split volatile loads!");

        // The load must be used exclusively to store into other pointers for
        // us to be able to arbitrarily pre-split it. The stores must also be
        // simple to avoid changing semantics.
        auto IsLoadSimplyStored = [](LoadInst *LI) {
          for (User *LU : LI->users()) {
            auto *SI = dyn_cast(LU);
            if (!SI || !SI->isSimple())
              return false;
          }
          return true;
        };
        if (!IsLoadSimplyStored(LI)) {
          UnsplittableLoads.insert(LI);
          continue;
        }

        Loads.push_back(LI);
      } else if (auto *SI = dyn_cast(S.getUse()->getUser())) {
        // Only stores *through* this use (it is the pointer operand) of a
        // simple loaded value are candidates.
        if (!SI ||
            S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
          continue;
        auto *StoredLoad = dyn_cast(SI->getValueOperand());
        if (!StoredLoad || !StoredLoad->isSimple())
          continue;
        assert(!SI->isVolatile() && "Cannot split volatile stores!");

        Stores.push_back(SI);
      } else {
        // Other uses cannot be pre-split.
        continue;
      }

      // Record the initial split.
      DEBUG(dbgs() << " Candidate: " << *I << "\n");
      auto &Offsets = SplitOffsetsMap[I];
      assert(Offsets.Splits.empty() &&
             "Should not have splits the first time we see an instruction!");
      Offsets.S = &S;
      // Split points are stored relative to the start of the slice.
      Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
    }

    // Now scan the already split slices, and add a split for any of them which
    // we're going to pre-split.
    for (Slice *S : P.splitSliceTails()) {
      auto SplitOffsetsMapI =
          SplitOffsetsMap.find(cast(S->getUse()->getUser()));
      if (SplitOffsetsMapI == SplitOffsetsMap.end())
        continue;
      auto &Offsets = SplitOffsetsMapI->second;

      assert(Offsets.S == S && "Found a mismatched slice!");
      assert(!Offsets.Splits.empty() &&
             "Cannot have an empty set of splits on the second partition!");
      assert(Offsets.Splits.back() ==
                 P.beginOffset() - Offsets.S->beginOffset() &&
             "Previous split does not end where this one begins!");

      // Record each split. The last partition's end isn't needed as the size
      // of the slice dictates that.
      if (S->endOffset() > P.endOffset())
        Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
    }
  }

  // We may have split loads where some of their stores are split stores. For
  // such loads and stores, we can only pre-split them if their splits exactly
  // match relative to their starting offset. We have to verify this prior to
  // any rewriting.
  Stores.erase(
      std::remove_if(Stores.begin(), Stores.end(),
                     [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
                       // Lookup the load we are storing in our map of split
                       // offsets.
                       auto *LI = cast(SI->getValueOperand());
                       // If it was completely unsplittable, then we're done,
                       // and this store can't be pre-split.
                       if (UnsplittableLoads.count(LI))
                         return true;

                       auto LoadOffsetsI = SplitOffsetsMap.find(LI);
                       if (LoadOffsetsI == SplitOffsetsMap.end())
                         return false; // Unrelated loads are definitely safe.
                       auto &LoadOffsets = LoadOffsetsI->second;

                       // Now lookup the store's offsets.
                       auto &StoreOffsets = SplitOffsetsMap[SI];

                       // If the relative offsets of each split in the load and
                       // store match exactly, then we can split them and we
                       // don't need to remove them here.
                       if (LoadOffsets.Splits == StoreOffsets.Splits)
                         return false;

                       DEBUG(dbgs() << " Mismatched splits for load and store:\n"
                                    << " " << *LI << "\n"
                                    << " " << *SI << "\n");

                       // We've found a store and load that we need to split
                       // with mismatched relative splits. Just give up on them
                       // and remove both instructions from our list of
                       // candidates.
                       UnsplittableLoads.insert(LI);
                       return true;
                     }),
      Stores.end());
  // Now we have to go *back* through all the stores, because a later store may
  // have caused an earlier store's load to become unsplittable and if it is
  // unsplittable for the later store, then we can't rely on it being split in
  // the earlier store either.
  Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
                              [&UnsplittableLoads](StoreInst *SI) {
                                auto *LI = cast(SI->getValueOperand());
                                return UnsplittableLoads.count(LI);
                              }),
               Stores.end());
  // Once we've established all the loads that can't be split for some reason,
  // filter any that made it into our list out.
  Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
                             [&UnsplittableLoads](LoadInst *LI) {
                               return UnsplittableLoads.count(LI);
                             }),
              Loads.end());

  // If no loads or stores are left, there is no pre-splitting to be done for
  // this alloca.
  if (Loads.empty() && Stores.empty())
    return false;

  // From here on, we can't fail and will be building new accesses, so rig up
  // an IR builder.
  IRBuilderTy IRB(&AI);

  // Collect the new slices which we will merge into the alloca slices.
  SmallVector NewSlices;

  // Track any allocas we end up splitting loads and stores for so we iterate
  // on them.
  SmallPtrSet ResplitPromotableAllocas;

  // At this point, we have collected all of the loads and stores we can
  // pre-split, and the specific splits needed for them. We actually do the
  // splitting in a specific order in order to handle when one of the loads is
  // the value operand to one of the stores.
  //
  // First, we rewrite all of the split loads, and just accumulate each split
  // load in a parallel structure. We also build the slices for them and append
  // them to the alloca slices.
  SmallDenseMap, 1> SplitLoadsMap;
  std::vector SplitLoads;
  const DataLayout &DL = AI.getModule()->getDataLayout();
  for (LoadInst *LI : Loads) {
    SplitLoads.clear();

    IntegerType *Ty = cast(LI->getType());
    uint64_t LoadSize = Ty->getBitWidth() / 8;
    assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");

    auto &Offsets = SplitOffsetsMap[LI];
    assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
           "Slice size should always match load size exactly!");
    uint64_t BaseOffset = Offsets.S->beginOffset();
    assert(BaseOffset + LoadSize > BaseOffset &&
           "Cannot represent alloca access size using 64-bit integers!");

    Instruction *BasePtr = cast(LI->getPointerOperand());
    IRB.SetInsertPoint(LI);

    DEBUG(dbgs() << " Splitting load: " << *LI << "\n");

    // Offsets.Splits holds N cut points relative to the slice start, giving
    // N + 1 parts. The loop body therefore runs once per cut plus once more
    // for the tail part, whose end is LoadSize; the exit test sits in the
    // middle of the body for that reason.
    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
    int Idx = 0, Size = Offsets.Splits.size();
    for (;;) {
      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
      auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
      LoadInst *PLoad = IRB.CreateAlignedLoad(
          getAdjustedPtr(IRB, DL, BasePtr,
                         APInt(DL.getPointerSizeInBits(), PartOffset),
                         PartPtrTy, BasePtr->getName() + "."),
          getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
          LI->getName());

      // Append this load onto the list of split loads so we can find it later
      // to rewrite the stores.
      SplitLoads.push_back(PLoad);

      // Now build a new slice for the alloca.
      NewSlices.push_back(
          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
                /*IsSplittable*/ false));
      DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
                   << ", " << NewSlices.back().endOffset() << "): " << *PLoad
                   << "\n");

      // See if we've handled all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
    }

    // Now that we have the split loads, do the slow walk over all uses of the
    // load and rewrite them as split stores, or save the split loads to use
    // below if the store is going to be split there anyways.
    bool DeferredStores = false;
    for (User *LU : LI->users()) {
      StoreInst *SI = cast(LU);
      if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
        DeferredStores = true;
        DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
        continue;
      }

      Value *StoreBasePtr = SI->getPointerOperand();
      IRB.SetInsertPoint(SI);

      DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");

      for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
        LoadInst *PLoad = SplitLoads[Idx];
        // Part 0 starts at offset 0; part Idx starts at cut point Idx - 1.
        uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
        auto *PartPtrTy =
            PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());

        StoreInst *PStore = IRB.CreateAlignedStore(
            PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
                                  APInt(DL.getPointerSizeInBits(), PartOffset),
                                  PartPtrTy, StoreBasePtr->getName() + "."),
            getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
        (void)PStore;
        DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
      }

      // We want to immediately iterate on any allocas impacted by splitting
      // this store, and we have to track any promotable alloca (indicated by
      // a direct store) as needing to be resplit because it is no longer
      // promotable.
      if (AllocaInst *OtherAI = dyn_cast(StoreBasePtr)) {
        ResplitPromotableAllocas.insert(OtherAI);
        Worklist.insert(OtherAI);
      } else if (AllocaInst *OtherAI = dyn_cast(
                     StoreBasePtr->stripInBoundsOffsets())) {
        Worklist.insert(OtherAI);
      }

      // Mark the original store as dead.
      DeadInsts.insert(SI);
    }

    // Save the split loads if there are deferred stores among the users.
    if (DeferredStores)
      SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));

    // Mark the original load as dead and kill the original slice.
    DeadInsts.insert(LI);
    Offsets.S->kill();
  }

  // Second, we rewrite all of the split stores. At this point, we know that
  // all loads from this alloca have been split already. For stores of such
  // loads, we can simply look up the pre-existing split loads. For stores of
  // other loads, we split those loads first and then write split stores of
  // them.
  for (StoreInst *SI : Stores) {
    auto *LI = cast(SI->getValueOperand());
    IntegerType *Ty = cast(LI->getType());
    uint64_t StoreSize = Ty->getBitWidth() / 8;
    assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");

    auto &Offsets = SplitOffsetsMap[SI];
    assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
           "Slice size should always match load size exactly!");
    uint64_t BaseOffset = Offsets.S->beginOffset();
    assert(BaseOffset + StoreSize > BaseOffset &&
           "Cannot represent alloca access size using 64-bit integers!");

    Value *LoadBasePtr = LI->getPointerOperand();
    Instruction *StoreBasePtr = cast(SI->getPointerOperand());

    DEBUG(dbgs() << " Splitting store: " << *SI << "\n");

    // Check whether we have an already split load.
    auto SplitLoadsMapI = SplitLoadsMap.find(LI);
    std::vector *SplitLoads = nullptr;
    if (SplitLoadsMapI != SplitLoadsMap.end()) {
      SplitLoads = &SplitLoadsMapI->second;
      assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
             "Too few split loads for the number of splits in the store!");
    } else {
      DEBUG(dbgs() << " of load: " << *LI << "\n");
    }

    // Same N-cuts / N + 1-parts walk as for the loads above, with the tail
    // part bounded by StoreSize.
    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
    int Idx = 0, Size = Offsets.Splits.size();
    for (;;) {
      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
      auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());

      // Either lookup a split load or create one.
      LoadInst *PLoad;
      if (SplitLoads) {
        PLoad = (*SplitLoads)[Idx];
      } else {
        IRB.SetInsertPoint(LI);
        PLoad = IRB.CreateAlignedLoad(
            getAdjustedPtr(IRB, DL, LoadBasePtr,
                           APInt(DL.getPointerSizeInBits(), PartOffset),
                           PartPtrTy, LoadBasePtr->getName() + "."),
            getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
            LI->getName());
      }

      // And store this partition.
      IRB.SetInsertPoint(SI);
      StoreInst *PStore = IRB.CreateAlignedStore(
          PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
                                APInt(DL.getPointerSizeInBits(), PartOffset),
                                PartPtrTy, StoreBasePtr->getName() + "."),
          getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);

      // Now build a new slice for the alloca.
      NewSlices.push_back(
          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PStore->getOperandUse(PStore->getPointerOperandIndex()),
                /*IsSplittable*/ false));
      DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
                   << ", " << NewSlices.back().endOffset() << "): " << *PStore
                   << "\n");
      if (!SplitLoads) {
        DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
      }

      // See if we've finished all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
    }

    // We want to immediately iterate on any allocas impacted by splitting
    // this load, which is only relevant if it isn't a load of this alloca and
    // thus we didn't already split the loads above. We also have to keep track
    // of any promotable allocas we split loads on as they can no longer be
    // promoted.
    if (!SplitLoads) {
      if (AllocaInst *OtherAI = dyn_cast(LoadBasePtr)) {
        assert(OtherAI != &AI && "We can't re-split our own alloca!");
        ResplitPromotableAllocas.insert(OtherAI);
        Worklist.insert(OtherAI);
      } else if (AllocaInst *OtherAI = dyn_cast(
                     LoadBasePtr->stripInBoundsOffsets())) {
        assert(OtherAI != &AI && "We can't re-split our own alloca!");
        Worklist.insert(OtherAI);
      }
    }

    // Mark the original store as dead now that we've split it up and kill its
    // slice. Note that we leave the original load in place unless this store
    // was its only use. It may in turn be split up if it is an alloca load
    // for some other alloca, but it may be a normal load. This may introduce
    // redundant loads, but where those can be merged the rest of the optimizer
    // should handle the merging, and this uncovers SSA splits which is more
    // important. In practice, the original loads will almost always be fully
    // split and removed eventually, and the splits will be merged by any
    // trivial CSE, including instcombine.
    if (LI->hasOneUse()) {
      assert(*LI->user_begin() == SI && "Single use isn't this store!");
      DeadInsts.insert(LI);
    }
    DeadInsts.insert(SI);
    Offsets.S->kill();
  }

  // Remove the killed slices that have been pre-split.
  AS.erase(std::remove_if(AS.begin(), AS.end(),
                          [](const Slice &S) { return S.isDead(); }),
           AS.end());

  // Insert our new slices. This will sort and merge them into the sorted
  // sequence.
  AS.insert(NewSlices);

  DEBUG(dbgs() << " Pre-split slices:\n");
#ifndef NDEBUG
  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
    DEBUG(AS.print(dbgs(), I, " "));
#endif

  // Finally, don't try to promote any allocas that now require re-splitting.
  // They have already been added to the worklist above.
  PromotableAllocas.erase(
      std::remove_if(
          PromotableAllocas.begin(), PromotableAllocas.end(),
          [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
      PromotableAllocas.end());

  return true;
}

/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive for SSA value
/// promotion. If the partition needs a new, more refined alloca, this will
/// build that new alloca, preserving as much type information as possible, and
/// rewrite the uses of the old alloca to point at the new one and have the
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and if it was successful queues the alloca to be
/// promoted.
///
/// \returns the alloca backing this partition after the rewrite: \p AI itself
/// when the chosen slice type equals AI's allocated type, otherwise the newly
/// created alloca.
AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                                   Partition &P) {
  // Try to compute a friendly type for this partition of the alloca. This
  // won't always succeed, in which case we fall back to a legal integer type
  // or an i8 array of an appropriate size.
  Type *SliceTy = nullptr;
  const DataLayout &DL = AI.getModule()->getDataLayout();
  if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
    if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
      SliceTy = CommonUseTy;
  if (!SliceTy)
    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                 P.beginOffset(), P.size()))
      SliceTy = TypePartitionTy;
  // With no type yet, or only an array of integers, use a single integer of
  // the partition's width when the target considers that width legal.
  if ((!SliceTy || (SliceTy->isArrayTy() &&
                    SliceTy->getArrayElementType()->isIntegerTy())) &&
      DL.isLegalInteger(P.size() * 8))
    SliceTy = Type::getIntNTy(*C, P.size() * 8);
  if (!SliceTy)
    SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
  assert(DL.getTypeAllocSize(SliceTy) >= P.size());

  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);

  // Vector promotion is only considered when integer widening is not viable.
  VectorType *VecTy =
      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
  if (VecTy)
    SliceTy = VecTy;

  // Check for the case where we're going to rewrite to a new alloca of the
  // exact same type as the original, and with the same access offsets. In that
  // case, re-use the existing alloca, but still run through the rewriter to
  // perform phi and select speculation.
  AllocaInst *NewAI;
  if (SliceTy == AI.getAllocatedType()) {
    assert(P.beginOffset() == 0 &&
           "Non-zero begin offset but same alloca type");
    NewAI = &AI;
    // FIXME: We should be able to bail at this point with "nothing changed".
    // FIXME: We might want to defer PHI speculation until after here.
    // FIXME: return nullptr;
  } else {
    unsigned Alignment = AI.getAlignment();
    if (!Alignment) {
      // The minimum alignment which users can rely on when the explicit
      // alignment is omitted or zero is that required by the ABI for this
      // type.
      Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
    }
    Alignment = MinAlign(Alignment, P.beginOffset());
    // If we will get at least this much alignment from the type alone, leave
    // the alloca's alignment unconstrained.
    if (Alignment <= DL.getABITypeAlignment(SliceTy))
      Alignment = 0;
    NewAI = new AllocaInst(
        SliceTy, nullptr, Alignment,
        AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
    ++NumNewAllocas;
  }

  DEBUG(dbgs() << "Rewriting alloca partition "
               << "[" << P.beginOffset() << "," << P.endOffset()
               << ") to: " << *NewAI << "\n");

  // Track the high watermark on the worklist as it is only relevant for
  // promoted allocas. We will reset it to this point if the alloca is not in
  // fact scheduled for promotion.
  unsigned PPWOldSize = PostPromotionWorklist.size();
  unsigned NumUses = 0;
  // NOTE(review): the set declarations, SmallPtrSetImpl iterators and the
  // std::max call below carry no template arguments in this copy of the file;
  // they look stripped by extraction -- restore them from the upstream source.
  SmallPtrSet PHIUsers;
  SmallPtrSet SelectUsers;

  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
                               P.endOffset(), IsIntegerPromotable, VecTy,
                               PHIUsers, SelectUsers);
  // The partition is promotable only if every slice visit reports success.
  bool Promotable = true;
  for (Slice *S : P.splitSliceTails()) {
    Promotable &= Rewriter.visit(S);
    ++NumUses;
  }
  for (Slice &S : P) {
    Promotable &= Rewriter.visit(&S);
    ++NumUses;
  }

  NumAllocaPartitionUses += NumUses;
  MaxUsesPerAllocaPartition = std::max(NumUses, MaxUsesPerAllocaPartition);

  // Now that we've processed all the slices in the new partition, check if any
  // PHIs or Selects would block promotion.
  for (SmallPtrSetImpl::iterator I = PHIUsers.begin(),
                                 E = PHIUsers.end();
       I != E; ++I)
    if (!isSafePHIToSpeculate(**I)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      break;
    }
  for (SmallPtrSetImpl::iterator I = SelectUsers.begin(),
                                 E = SelectUsers.end();
       I != E; ++I)
    if (!isSafeSelectToSpeculate(**I)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      break;
    }

  if (Promotable) {
    if (PHIUsers.empty() && SelectUsers.empty()) {
      // Promote the alloca.
      PromotableAllocas.push_back(NewAI);
    } else {
      // If we have either PHIs or Selects to speculate, add them to those
      // worklists and re-queue the new alloca so that we promote it on the
      // next iteration.
      for (PHINode *PHIUser : PHIUsers)
        SpeculatablePHIs.insert(PHIUser);
      for (SelectInst *SelectUser : SelectUsers)
        SpeculatableSelects.insert(SelectUser);
      Worklist.insert(NewAI);
    }
  } else {
    // If we can't promote the alloca, iterate on it to check for new
    // refinements exposed by splitting the current alloca. Don't iterate on an
    // alloca which didn't actually change and didn't get promoted.
    if (NewAI != &AI)
      Worklist.insert(NewAI);

    // Drop any post-promotion work items if promotion didn't happen.
    while (PostPromotionWorklist.size() > PPWOldSize)
      PostPromotionWorklist.pop_back();
  }

  return NewAI;
}

/// \brief Walks the slices of an alloca and form partitions based on them,
/// rewriting each of their uses.
///
/// \returns true if pre-splitting changed anything or if any call to
/// rewritePartition returned an alloca; false if \p AS has no slices.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
  if (AS.begin() == AS.end())
    return false;

  unsigned NumPartitions = 0;
  bool Changed = false;
  const DataLayout &DL = AI.getModule()->getDataLayout();

  // First try to pre-split loads and stores.
  Changed |= presplitLoadsAndStores(AI, AS);

  // Now that we have identified any pre-splitting opportunities, mark any
  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
  // to split these during pre-splitting, we want to force them to be
  // rewritten into a partition.
  bool IsSorted = true;
  for (Slice &S : AS) {
    if (!S.isSplittable())
      continue;
    // FIXME: We currently leave whole-alloca splittable loads and stores. This
    // used to be the only splittable loads and stores and we need to be
    // confident that the above handling of splittable loads and stores is
    // completely sufficient before we forcibly disable the remaining handling.
    if (S.beginOffset() == 0 &&
        S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
      continue;
    // NOTE(review): both isa calls lack their template argument in this copy;
    // per the comment above they presumably test for loads and stores.
    if (isa(S.getUse()->getUser()) || isa(S.getUse()->getUser())) {
      S.makeUnsplittable();
      // Flipping splittability can change the slice ordering, so re-sort.
      IsSorted = false;
    }
  }
  if (!IsSorted)
    std::sort(AS.begin(), AS.end());

  /// \brief Describes the allocas introduced by rewritePartition
  /// in order to migrate the debug info.
  ///
  /// Offset and Size are expressed in bits.
  struct Piece {
    AllocaInst *Alloca;
    uint64_t Offset;
    uint64_t Size;
    Piece(AllocaInst *AI, uint64_t O, uint64_t S)
      : Alloca(AI), Offset(O), Size(S) {}
  };
  SmallVector Pieces;

  // Rewrite each partition.
  for (auto &P : AS.partitions()) {
    if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
      Changed = true;
      // Only partitions that received a fresh alloca need their debug info
      // migrated below.
      if (NewAI != &AI) {
        uint64_t SizeOfByte = 8;
        uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
        // Don't include any padding.
        uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
        Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
      }
    }
    ++NumPartitions;
  }

  NumAllocaPartitions += NumPartitions;
  MaxPartitionsPerAlloca = std::max(NumPartitions, MaxPartitionsPerAlloca);

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
    auto *Var = DbgDecl->getVariable();
    auto *Expr = DbgDecl->getExpression();
    DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
    uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
    for (auto Piece : Pieces) {
      // Create a piece expression describing the new partition or reuse AI's
      // expression if there is only one partition.
      auto *PieceExpr = Expr;
      if (Piece.Size < AllocaSize || Expr->isBitPiece()) {
        // If this alloca is already a scalar replacement of a larger aggregate,
        // Piece.Offset describes the offset inside the scalar.
        uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
        uint64_t Start = Offset + Piece.Offset;
        uint64_t Size = Piece.Size;
        if (Expr->isBitPiece()) {
          // Clamp the new piece to the extent of the existing bit piece.
          uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize();
          if (Start >= AbsEnd)
            // No need to describe a SROAed padding.
            continue;
          Size = std::min(Size, AbsEnd - Start);
        }
        PieceExpr = DIB.createBitPieceExpression(Start, Size);
      } else {
        assert(Pieces.size() == 1 &&
               "partition is as large as original alloca");
      }

      // Remove any existing dbg.declare intrinsic describing the same alloca.
      if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
        OldDDI->eraseFromParent();

      DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(),
                        &AI);
    }
  }
  return Changed;
}

/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
  Value *OldV = U;
  // Replace the use with an undef value.
  U = UndefValue::get(OldV->getType());

  // Check for this making an instruction dead. We have to garbage collect
  // all the dead instructions to ensure the uses of any alloca end up being
  // minimal.
  if (Instruction *OldI = dyn_cast(OldV))
    if (isInstructionTriviallyDead(OldI)) {
      DeadInsts.insert(OldI);
    }
}

/// \brief Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
///
/// \returns true if the alloca or any of its users were changed or erased.
bool SROA::runOnAlloca(AllocaInst &AI) {
  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    return true;
  }
  const DataLayout &DL = AI.getModule()->getDataLayout();

  // Skip alloca forms that this analysis can't handle.
  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
      DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
    return false;

  bool Changed = false;

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
  AggLoadStoreRewriter AggRewriter;
  Changed |= AggRewriter.rewrite(AI);

  // Build the slices using a recursive instruction-visiting builder.
  AllocaSlices AS(DL, AI);
  DEBUG(AS.print(dbgs()));
  if (AS.isEscaped())
    return Changed;

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (Instruction *DeadUser : AS.getDeadUsers()) {
    // Free up everything used by this instruction.
    for (Use &DeadOp : DeadUser->operands())
      clobberUse(DeadOp);

    // Now replace the uses of this instruction.
    DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));

    // And mark it for deletion.
    DeadInsts.insert(DeadUser);
    Changed = true;
  }
  for (Use *DeadOp : AS.getDeadOperands()) {
    clobberUse(*DeadOp);
    Changed = true;
  }

  // No slices to split. Leave the dead alloca for a later pass to clean up.
  if (AS.begin() == AS.end())
    return Changed;

  Changed |= splitAlloca(AI, AS);

  // Drain the speculation worklists (rewritePartition adds to them).
  DEBUG(dbgs() << " Speculating PHIs\n");
  while (!SpeculatablePHIs.empty())
    speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());

  DEBUG(dbgs() << " Speculating Selects\n");
  while (!SpeculatableSelects.empty())
    speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());

  return Changed;
}

/// \brief Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
void SROA::deleteDeadInstructions(
    SmallPtrSetImpl &DeletedAllocas) {
  // NOTE(review): SmallPtrSetImpl and the dyn_cast calls below appear without
  // template arguments; the extraction seems to have dropped the <...> text --
  // confirm against upstream before compiling.
  // DeletedAllocas receives every alloca erased here so that the caller can
  // drop it from its own lists.
  while (!DeadInsts.empty()) {
    Instruction *I = DeadInsts.pop_back_val();
    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    // Detach any remaining users before erasing the instruction.
    I->replaceAllUsesWith(UndefValue::get(I->getType()));

    for (Use &Operand : I->operands())
      if (Instruction *U = dyn_cast(Operand)) {
        // Zero out the operand and see if it becomes trivially dead.
        Operand = nullptr;
        if (isInstructionTriviallyDead(U))
          DeadInsts.insert(U);
      }

    if (AllocaInst *AI = dyn_cast(I)) {
      DeletedAllocas.insert(AI);
      // Also drop the dbg.declare describing the alloca, if there is one.
      if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
        DbgDecl->eraseFromParent();
    }

    ++NumDeleted;
    I->eraseFromParent();
  }
}

/// \brief Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// This function returns whether any promotion occurred.
///
/// \param F Not referenced by the body as visible here.
/// \returns false if PromotableAllocas was empty, true otherwise.
bool SROA::promoteAllocas(Function &F) {
  if (PromotableAllocas.empty())
    return false;

  NumPromoted += PromotableAllocas.size();

  DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
  PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
  // The list is consumed: clear it so the next round starts empty.
  PromotableAllocas.clear();
  return true;
}

/// Run SROA over \p F using the supplied dominator tree and assumption cache.
///
/// Seeds the worklist with the allocas found in the entry block, then
/// alternates between rewriting allocas and promoting them until no work is
/// left.
///
/// \returns PreservedAnalyses::none() if anything changed, otherwise
/// PreservedAnalyses::all().
PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
                                AssumptionCache &RunAC) {
  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
  C = &F.getContext();
  DT = &RunDT;
  AC = &RunAC;

  BasicBlock &EntryBB = F.getEntryBlock();
  // The scan stops one instruction before end(), so the final instruction of
  // the entry block is never examined (presumably the terminator -- confirm).
  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
       I != E; ++I) {
    if (AllocaInst *AI = dyn_cast(I))
      Worklist.insert(AI);
  }

  bool Changed = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet DeletedAllocas;

  do {
    while (!Worklist.empty()) {
      Changed |= runOnAlloca(*Worklist.pop_back_val());
      deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
        Worklist.remove_if(IsInSet);
        PostPromotionWorklist.remove_if(IsInSet);
        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
                                               PromotableAllocas.end(),
                                               IsInSet),
                                PromotableAllocas.end());
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas(F);

    // Whatever was queued for after promotion becomes the next round's
    // worklist.
    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  // FIXME: Even when promoting allocas we should preserve some abstract set of
  // CFG-specific analyses.
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

/// Entry point taking an AnalysisManager: fetches two analysis results for
/// \p F from \p AM and forwards them to runImpl().
// NOTE(review): AnalysisManager and both getResult calls appear without
// template arguments in this extract; runImpl's signature suggests they yield
// a DominatorTree and an AssumptionCache -- confirm against upstream.
PreservedAnalyses SROA::run(Function &F, AnalysisManager *AM) {
  return runImpl(F, AM->getResult(F),
                 AM->getResult(F));
}

/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
///
/// This is in the llvm namespace purely to allow it to be a friend of the \c
/// SROA pass.
class llvm::sroa::SROALegacyPass : public FunctionPass {
  /// The SROA implementation.
  SROA Impl;

public:
  SROALegacyPass() : FunctionPass(ID) {
    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
  }
  /// Runs the wrapped implementation on \p F unless skipOptnoneFunction()
  /// says to skip it. Reports a change whenever runImpl() did not preserve
  /// all analyses.
  bool runOnFunction(Function &F) override {
    if (skipOptnoneFunction(F))
      return false;

    auto PA = Impl.runImpl(
        F, getAnalysis().getDomTree(),
        getAnalysis().getAssumptionCache(F));
    return !PA.areAllPreserved();
  }
  /// Declares two required analyses and one preserved analysis, and that the
  /// CFG is preserved.
  // NOTE(review): the analysis type names are missing from this extract; the
  // INITIALIZE_PASS_DEPENDENCY entries below name AssumptionCacheTracker and
  // DominatorTreeWrapperPass -- confirm which call takes which.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired();
    AU.addRequired();
    AU.addPreserved();
    AU.setPreservesCFG();
  }

  const char *getPassName() const override { return "SROA"; }
  static char ID;
};

char SROALegacyPass::ID = 0;

/// Creates a new instance of the legacy SROA wrapper pass.
FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }

INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
                      "Scalar Replacement Of Aggregates", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
                    false, false)
diff --git a/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 0fc6953350b8..c89c763138f9 100644
--- a/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -1,2630 +1,2625 @@
//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This transformation implements the well known scalar replacement of
// aggregates transformation. This xform breaks up alloca instructions of
// aggregate type (structure or array) into individual alloca instructions for
// each member (if possible). Then, if possible, it transforms the individual
// alloca instructions into nice clean scalar SSA form.
//
// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they
// often interact, especially for C++ programs. As such, iterating between
// SRoA, then Mem2Reg until we run out of things to promote works well.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

#define DEBUG_TYPE "scalarrepl"

STATISTIC(NumReplaced, "Number of allocas broken up");
STATISTIC(NumPromoted, "Number of allocas promoted");
STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion");
STATISTIC(NumConverted, "Number of aggregates converted to scalar");

namespace {
// From here on the preprocessor rewrites the identifier SROA to SROA_, so the
// struct below is really named SROA_ (presumably to keep it distinct from the
// other SROA pass -- confirm).
#define SROA SROA_
/// SROA - Common base of the two scalarrepl pass variants. It stores the size
/// thresholds and declares the analysis and rewriting helpers; most of them
/// are defined outside this struct.
struct SROA : public FunctionPass {
  /// Each threshold argument accepts -1 to select the built-in default:
  /// \param T     alloca size threshold; -1 selects 128.
  /// \param hasDT stored in HasDomTree.
  /// \param ID    pass identifier forwarded to FunctionPass.
  /// \param ST    struct member threshold; -1 selects 32.
  /// \param AT    array element threshold; -1 selects 8.
  /// \param SLT   scalar load threshold; -1 means no limit.
  SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT)
      : FunctionPass(ID), HasDomTree(hasDT) {
    if (T == -1)
      SRThreshold = 128;
    else
      SRThreshold = T;
    if (ST == -1)
      StructMemberThreshold = 32;
    else
      StructMemberThreshold = ST;
    if (AT == -1)
      ArrayElementThreshold = 8;
    else
      ArrayElementThreshold = AT;
    if (SLT == -1)
      // Do not limit the scalar integer load size if no threshold is given.
      // ScalarLoadThreshold is unsigned, so -1 becomes its largest value.
      ScalarLoadThreshold = -1;
    else
      ScalarLoadThreshold = SLT;
  }

  bool runOnFunction(Function &F) override;

  bool performScalarRepl(Function &F);
  bool performPromotion(Function &F);

private:
  /// HasDomTree - The hasDT flag given to the constructor. The SROA_DT
  /// variant passes true and the SROA_SSAUp variant passes false.
  bool HasDomTree;

  /// DeadInsts - Keep track of instructions we have made dead, so that
  /// we can remove them after we are done working.
  // NOTE(review): SmallVector here, and SmallPtrSet, std::vector and
  // SmallVectorImpl below, appear without template arguments; the extraction
  // seems to have dropped the <...> text -- confirm against upstream.
  SmallVector DeadInsts;

  /// AllocaInfo - When analyzing uses of an alloca instruction, this captures
  /// information about the uses. All these fields are initialized to false
  /// and set to true when something is learned.
  struct AllocaInfo {
    /// The alloca to promote.
    AllocaInst *AI;

    /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite
    /// looping and avoid redundant work.
    SmallPtrSet CheckedPHIs;

    /// isUnsafe - This is set to true if the alloca cannot be SROA'd.
    bool isUnsafe : 1;

    /// isMemCpySrc - This is true if this aggregate is memcpy'd from.
    bool isMemCpySrc : 1;

    /// isMemCpyDst - This is true if this aggregate is memcpy'd into.
    bool isMemCpyDst : 1;

    /// hasSubelementAccess - This is true if a subelement of the alloca is
    /// ever accessed, or false if the alloca is only accessed with mem
    /// intrinsics or load/store that only access the entire alloca at once.
    bool hasSubelementAccess : 1;

    /// hasALoadOrStore - This is true if there are any loads or stores to it.
    /// The alloca may just be accessed with memcpy, for example, which would
    /// not set this.
    bool hasALoadOrStore : 1;

    explicit AllocaInfo(AllocaInst *ai)
        : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false),
          hasSubelementAccess(false), hasALoadOrStore(false) {}
  };

  /// SRThreshold - The maximum alloca size to be considered for SROA.
  /// Defaults to 128.
  unsigned SRThreshold;

  /// StructMemberThreshold - The maximum number of members a struct can
  /// contain to be considered for SROA. Defaults to 32.
  unsigned StructMemberThreshold;

  /// ArrayElementThreshold - The maximum number of elements an array can
  /// have to be considered for SROA. Defaults to 8.
  unsigned ArrayElementThreshold;

  /// ScalarLoadThreshold - The maximum size in bits of scalars to load when
  /// converting to scalar. Unlimited by default.
  unsigned ScalarLoadThreshold;

  /// MarkUnsafe - Flag \p I as unsafe and log \p User as the instruction that
  /// prevented the transformation.
  void MarkUnsafe(AllocaInfo &I, Instruction *User) {
    I.isUnsafe = true;
    DEBUG(dbgs() << " Transformation preventing inst: " << *User << '\n');
  }

  // Safety analysis helpers.
  bool isSafeAllocaToScalarRepl(AllocaInst *AI);

  void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info);
  void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset,
                                       AllocaInfo &Info);
  void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info);
  void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
                       Type *MemOpType, bool isStore, AllocaInfo &Info,
                       Instruction *TheAccess, bool AllowWholeAccess);
  bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
                        const DataLayout &DL);
  uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy,
                                const DataLayout &DL);

  // Rewriting helpers.
  void DoScalarReplacement(AllocaInst *AI,
                           std::vector &WorkList);
  void DeleteDeadInstructions();

  void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
                            SmallVectorImpl &NewElts);
  void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
                      SmallVectorImpl &NewElts);
  void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
                  SmallVectorImpl &NewElts);
  void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
                                uint64_t Offset,
                                SmallVectorImpl &NewElts);
  void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
                                    AllocaInst *AI,
                                    SmallVectorImpl &NewElts);
  void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
                                     SmallVectorImpl &NewElts);
  void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
                                    SmallVectorImpl &NewElts);
  bool ShouldAttemptScalarRepl(AllocaInst *AI);
};

// SROA_DT - SROA that uses DominatorTree.
struct SROA_DT : public SROA {
  static char ID;

public:
  /// Forwards the thresholds to the base class with hasDT = true. An argument
  /// of -1 (the default for each) is passed through unchanged.
  SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1)
      : SROA(T, true, ID, ST, AT, SLT) {
    initializeSROA_DTPass(*PassRegistry::getPassRegistry());
  }

  // getAnalysisUsage - This variant requires two analyses and does not alter
  // the CFG, so say so.
  // NOTE(review): the analysis type names are missing from this extract; the
  // INITIALIZE_PASS_DEPENDENCY entries for SROA_DT below name
  // AssumptionCacheTracker and DominatorTreeWrapperPass -- confirm.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired();
    AU.addRequired();
    AU.setPreservesCFG();
  }
};

// SROA_SSAUp - SROA that uses SSAUpdater.
struct SROA_SSAUp : public SROA {
  static char ID;

public:
  /// Forwards the thresholds to the base class with hasDT = false. An
  /// argument of -1 (the default for each) is passed through unchanged.
  SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1)
      : SROA(T, false, ID, ST, AT, SLT) {
    initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry());
  }

  // getAnalysisUsage - This variant requires one analysis and does not alter
  // the CFG, so say so.
  // NOTE(review): the analysis type name is missing from this extract; the
  // only INITIALIZE_PASS_DEPENDENCY entry for SROA_SSAUp below names
  // AssumptionCacheTracker -- confirm.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired();
    AU.setPreservesCFG();
  }
};

}

char SROA_DT::ID = 0;
char SROA_SSAUp::ID = 0;

INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
                "Scalar Replacement of Aggregates (DT)", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
                "Scalar Replacement of Aggregates (DT)", false, false)

INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
                      "Scalar Replacement of Aggregates (SSAUp)", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
                    "Scalar Replacement of Aggregates (SSAUp)", false, false)

// Public interface to the ScalarReplAggregates pass.
//
// UseDomTree selects the variant: SROA_DT when true, SROA_SSAUp when false.
// The four threshold arguments are forwarded unchanged to the chosen
// variant's constructor. The caller receives a newly allocated pass.
FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold,
                                                   bool UseDomTree,
                                                   int StructMemberThreshold,
                                                   int ArrayElementThreshold,
                                                   int ScalarLoadThreshold) {
  if (UseDomTree)
    return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold,
                       ScalarLoadThreshold);
  return new SROA_SSAUp(Threshold, StructMemberThreshold,
                        ArrayElementThreshold, ScalarLoadThreshold);
}
//===----------------------------------------------------------------------===//
// Convert To Scalar Optimization.
//===----------------------------------------------------------------------===//

namespace {
/// ConvertToScalarInfo - This class implements the "Convert To Scalar"
/// optimization, which scans the uses of an alloca and determines if it can
/// rewrite it in terms of a single new alloca that can be mem2reg'd.
class ConvertToScalarInfo {
  /// AllocaSize - The size of the alloca being considered in bytes.
  unsigned AllocaSize;
  const DataLayout &DL;
  /// ScalarLoadThreshold - Upper bound, in bits, on the integer type that
  /// TryConvert is allowed to create.
  unsigned ScalarLoadThreshold;

  /// IsNotTrivial - This is set to true if there is some access to the object
  /// which means that mem2reg can't promote it.
  bool IsNotTrivial;

  /// ScalarKind - Tracks the kind of alloca being considered for promotion,
  /// computed based on the uses of the alloca rather than the LLVM type system.
  enum {
    Unknown,

    // Accesses via GEPs that are consistent with element access of a vector
    // type. This will not be converted into a vector unless there is a later
    // access using an actual vector type.
    ImplicitVector,

    // Accesses via vector operations and GEPs that are consistent with the
    // layout of a vector type.
    Vector,

    // An integer bag-of-bits with bitwise operations for insertion and
    // extraction. Any combination of types can be converted into this kind
    // of scalar.
    Integer
  } ScalarKind;

  /// VectorTy - This tracks the type that we should promote the vector to if
  /// it is possible to turn it into a vector. This starts out null. When a
  /// vector form turns out not to be possible, the code visible here records
  /// that by setting ScalarKind to Integer; VectorTy is not reset.
  VectorType *VectorTy;

  /// HadNonMemTransferAccess - True if there is at least one access to the
  /// alloca that is not a MemTransferInst. We don't want to turn structs into
  /// large integers unless there is some potential for optimization.
  bool HadNonMemTransferAccess;

  /// HadDynamicAccess - True if some element of this alloca was dynamic.
  /// We don't yet have support for turning a dynamic access into a large
  /// integer.
  bool HadDynamicAccess;

public:
  /// \param Size the size of the alloca in bytes.
  /// \param DL   the data layout used for size and offset queries.
  /// \param SLT  the scalar load threshold in bits.
  explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL,
                               unsigned SLT)
    : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false),
    ScalarKind(Unknown), VectorTy(nullptr), HadNonMemTransferAccess(false),
    HadDynamicAccess(false) { }

  AllocaInst *TryConvert(AllocaInst *AI);

private:
  // Analysis: decide whether, and into what, the alloca can be converted.
  bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx);
  void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset);
  bool MergeInVectorType(VectorType *VInTy, uint64_t Offset);
  // Rewriting: redirect every use to the new alloca.
  void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset,
                           Value *NonConstantIdx);

  Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
                                    uint64_t Offset, Value* NonConstantIdx,
                                    IRBuilder<> &Builder);
  Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
                                   uint64_t Offset, Value* NonConstantIdx,
                                   IRBuilder<> &Builder);
};
} // end anonymous namespace.


/// TryConvert - Analyze the specified alloca, and if it is safe to do so,
/// rewrite it to be a new alloca which is mem2reg'able. This returns the new
/// alloca if possible or null if not. The new alloca is inserted before the
/// first instruction of AI's block, and AI itself is not erased here.
AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
  // If we can't convert this scalar, or if mem2reg can trivially do it, bail
  // out.
  if (!CanConvertToScalar(AI, 0, nullptr) || !IsNotTrivial)
    return nullptr;

  // If an alloca has only memset / memcpy uses, it may still have an Unknown
  // ScalarKind. Treat it as an Integer below.
  if (ScalarKind == Unknown)
    ScalarKind = Integer;

  // A vector type that does not cover the whole alloca cannot be used; fall
  // back to the integer form.
  if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8)
    ScalarKind = Integer;

  // If we were able to find a vector type that can handle this with
  // insert/extract elements, and if there was at least one use that had
  // a vector type, promote this to a vector. We don't want to promote
  // random stuff that doesn't use vectors (e.g. <9 x double>) because then
  // we just get a lot of insert/extracts. If at least one vector is
  // involved, then we probably really do have a union of vector/array.
  Type *NewTy;
  if (ScalarKind == Vector) {
    assert(VectorTy && "Missing type for vector scalar.");
    DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = "
          << *VectorTy << '\n');
    NewTy = VectorTy; // Use the vector type.
  } else {
    unsigned BitWidth = AllocaSize * 8;

    // Do not convert to scalar integer if the alloca size exceeds the
    // scalar load threshold.
    if (BitWidth > ScalarLoadThreshold)
      return nullptr;

    // When the only accesses were memory transfers, convert only if the
    // result fits in a legal integer.
    if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
        !HadNonMemTransferAccess && !DL.fitsInLegalInteger(BitWidth))
      return nullptr;
    // Dynamic accesses on integers aren't yet supported. They need us to shift
    // by a dynamic amount which could be difficult to work out as we might not
    // know whether to use a left or right shift.
    if (ScalarKind == Integer && HadDynamicAccess)
      return nullptr;

    DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
    // Create and insert the integer alloca.
    NewTy = IntegerType::get(AI->getContext(), BitWidth);
  }
  AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "",
                                     &AI->getParent()->front());
  ConvertUsesToScalar(AI, NewAI, 0, nullptr);
  return NewAI;
}

/// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type
/// (VectorTy) so far at the offset specified by Offset (which is specified in
/// bytes).
///
/// There are two cases we handle here:
///   1) A union of vector types of the same size and potentially its elements.
///      Here we turn element accesses into insert/extract element operations.
///      This promotes a <4 x float> with a store of float to the third element
///      into a <4 x float> that uses insert element.
///   2) A fully general blob of memory, which we turn into some (potentially
///      large) integer type with extract and insert operations where the loads
///      and stores would mutate the memory.  We mark this by setting
///      ScalarKind to Integer.
void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In,
                                                    uint64_t Offset) {
  // If we already decided to turn this into a blob of integer memory, there is
  // nothing to be done.
  if (ScalarKind == Integer)
    return;

  // If this could be contributing to a vector, analyze it.

  // If the In type is a vector that is the same size as the alloca, see if it
  // matches the existing VectorTy.
  // NOTE(review): dyn_cast, cast, isa, static_cast and SmallVector appear
  // without template arguments throughout this span; the extraction seems to
  // have dropped the <...> text -- confirm against upstream before compiling.
  if (VectorType *VInTy = dyn_cast(In)) {
    if (MergeInVectorType(VInTy, Offset))
      return;
  } else if (In->isFloatTy() || In->isDoubleTy() ||
             (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 &&
              isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
    // Full width accesses can be ignored, because they can always be turned
    // into bitcasts.
    unsigned EltSize = In->getPrimitiveSizeInBits()/8;
    if (EltSize == AllocaSize)
      return;

    // If we're accessing something that could be an element of a vector, see
    // if the implied vector agrees with what we already have and if Offset is
    // compatible with it.
    if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
        (!VectorTy || EltSize == VectorTy->getElementType()
                                         ->getPrimitiveSizeInBits()/8)) {
      if (!VectorTy) {
        // First element-sized access: remember the vector type it implies.
        ScalarKind = ImplicitVector;
        VectorTy = VectorType::get(In, AllocaSize/EltSize);
      }
      return;
    }
  }

  // Otherwise, we have a case that we can't handle with an optimized vector
  // form.  We can still turn this into a large integer.
  ScalarKind = Integer;
}

/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore,
/// returning true if the type was successfully merged and false otherwise.
/// A merge succeeds only for a vector as large as the whole alloca, accessed
/// at offset 0.
bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
                                            uint64_t Offset) {
  if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
    // If we're storing/loading a vector of the right size, allow it as a
    // vector.  If this is the first vector we see, remember the type so that
    // we know the element size. If this is a subsequent access, ignore it
    // even if it is a differing type but the same size. Worst case we can
    // bitcast the resultant vectors.
    if (!VectorTy)
      VectorTy = VInTy;
    ScalarKind = Vector;
    return true;
  }

  return false;
}

/// CanConvertToScalar - V is a pointer.  Walk every user of V and return true
/// if all of them are accesses this transformation knows how to rewrite. What
/// was seen is recorded in ScalarKind and VectorTy. Further, if the use is
/// not a completely trivial use that mem2reg could promote, set IsNotTrivial.
/// Offset is the current offset, in bytes, from the base of the alloca being
/// analyzed. NonConstantIdx is the variable GEP index seen on the way to V,
/// or null if every index so far was constant.
bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
                                             Value* NonConstantIdx) {
  for (User *U : V->users()) {
    Instruction *UI = cast(U);

    if (LoadInst *LI = dyn_cast(UI)) {
      // Don't break volatile loads.
      if (!LI->isSimple())
        return false;
      // Don't touch MMX operations.
      if (LI->getType()->isX86_MMXTy())
        return false;
      HadNonMemTransferAccess = true;
      MergeInTypeForLoadOrStore(LI->getType(), Offset);
      continue;
    }

    if (StoreInst *SI = dyn_cast(UI)) {
      // Storing the pointer, not into the value?
      if (SI->getOperand(0) == V || !SI->isSimple()) return false;
      // Don't touch MMX operations.
      if (SI->getOperand(0)->getType()->isX86_MMXTy())
        return false;
      HadNonMemTransferAccess = true;
      MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset);
      continue;
    }

    if (BitCastInst *BCI = dyn_cast(UI)) {
      if (!onlyUsedByLifetimeMarkers(BCI))
        IsNotTrivial = true;  // Can't be mem2reg'd.
      if (!CanConvertToScalar(BCI, Offset, NonConstantIdx))
        return false;
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast(UI)) {
      // If the GEP's pointer operand is not of pointer type, we can't handle
      // it.
      PointerType* PtrTy = dyn_cast(GEP->getPointerOperandType());
      if (!PtrTy)
        return false;

      // Compute the offset that this GEP adds to the pointer.
      SmallVector Indices(GEP->op_begin()+1, GEP->op_end());
      Value *GEPNonConstantIdx = nullptr;
      if (!GEP->hasAllConstantIndices()) {
        // A variable index is accepted only when no variable index has been
        // seen yet and it has type i32. It is taken from the last position,
        // leaving the other indices for the offset computation.
        if (!isa(PtrTy->getElementType()))
          return false;
        if (NonConstantIdx)
          return false;
        GEPNonConstantIdx = Indices.pop_back_val();
        if (!GEPNonConstantIdx->getType()->isIntegerTy(32))
          return false;
        HadDynamicAccess = true;
      } else
        GEPNonConstantIdx = NonConstantIdx;
      uint64_t GEPOffset = DL.getIndexedOffset(PtrTy,
                                               Indices);
      // See if all uses can be converted.
      if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx))
        return false;
      IsNotTrivial = true;  // Can't be mem2reg'd.
      HadNonMemTransferAccess = true;
      continue;
    }

    // If this is a constant sized memset of a constant value (e.g. 0) we can
    // handle it.
    if (MemSetInst *MSI = dyn_cast(UI)) {
      // Store to dynamic index.
      if (NonConstantIdx)
        return false;
      // Store of constant value.
      if (!isa(MSI->getValue()))
        return false;

      // Store of constant size.
      ConstantInt *Len = dyn_cast(MSI->getLength());
      if (!Len)
        return false;

      // If the size differs from the alloca, we can only convert the alloca to
      // an integer bag-of-bits.
      // FIXME: This should handle all of the cases that are currently accepted
      // as vector element insertions.
      if (Len->getZExtValue() != AllocaSize || Offset != 0)
        ScalarKind = Integer;

      IsNotTrivial = true;  // Can't be mem2reg'd.
      HadNonMemTransferAccess = true;
      continue;
    }

    // If this is a memcpy or memmove into or out of the whole allocation, we
    // can handle it like a load or store of the scalar type.
    if (MemTransferInst *MTI = dyn_cast(UI)) {
      // Store to dynamic index.
      if (NonConstantIdx)
        return false;
      ConstantInt *Len = dyn_cast(MTI->getLength());
      if (!Len || Len->getZExtValue() != AllocaSize || Offset != 0)
        return false;

      IsNotTrivial = true;  // Can't be mem2reg'd.
      continue;
    }

    // If this is a lifetime intrinsic, we can handle it.
    if (IntrinsicInst *II = dyn_cast(UI)) {
      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
          II->getIntrinsicID() == Intrinsic::lifetime_end) {
        continue;
      }
    }

    // Otherwise, we cannot handle this!
    return false;
  }

  return true;
}

/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
/// directly.  This happens when we are converting an "integer union" to a
/// single integer scalar, or when we are converting a "vector union" to a
/// vector with insert/extractelement instructions.
///
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.  By the end of this, there should be no uses of Ptr.
/// Note that this offset is in bits, whereas CanConvertToScalar tracks bytes;
/// GEP byte offsets are multiplied by 8 below.
void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
                                              uint64_t Offset,
                                              Value* NonConstantIdx) {
  // Each iteration removes one user of Ptr, so the loop ends when none remain.
  while (!Ptr->use_empty()) {
    Instruction *User = cast(Ptr->user_back());

    if (BitCastInst *CI = dyn_cast(User)) {
      ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx);
      CI->eraseFromParent();
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast(User)) {
      // Compute the offset that this GEP adds to the pointer.
      SmallVector Indices(GEP->op_begin()+1, GEP->op_end());
      Value* GEPNonConstantIdx = nullptr;
      if (!GEP->hasAllConstantIndices()) {
        assert(!NonConstantIdx &&
               "Dynamic GEP reading from dynamic GEP unsupported");
        GEPNonConstantIdx = Indices.pop_back_val();
      } else
        GEPNonConstantIdx = NonConstantIdx;
      uint64_t GEPOffset = DL.getIndexedOffset(GEP->getPointerOperandType(),
                                               Indices);
      ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx);
      GEP->eraseFromParent();
      continue;
    }

    // All replacement instructions are built through a builder constructed
    // from the user being replaced.
    IRBuilder<> Builder(User);

    if (LoadInst *LI = dyn_cast(User)) {
      // The load is a bit extract from NewAI shifted right by Offset bits.
      Value *LoadedVal = Builder.CreateLoad(NewAI);
      Value *NewLoadVal
        = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset,
                                     NonConstantIdx, Builder);
      LI->replaceAllUsesWith(NewLoadVal);
      LI->eraseFromParent();
      continue;
    }

    if (StoreInst *SI = dyn_cast(User)) {
      assert(SI->getOperand(0) != Ptr && "Consistency error!");
      // Read the current value, merge the stored value into it, and write the
      // result back.
      Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
      Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
                                             NonConstantIdx, Builder);
      Builder.CreateStore(New, NewAI);
      SI->eraseFromParent();

      // If the load we just inserted is now dead, then the inserted store
      // overwrote the entire thing.
      if (Old->use_empty())
        Old->eraseFromParent();
      continue;
    }

    // If this is a constant sized memset of a constant value (e.g. 0) we can
    // transform it into a store of the expanded constant value.
    if (MemSetInst *MSI = dyn_cast(User)) {
      assert(MSI->getRawDest() == Ptr && "Consistency error!");
      assert(!NonConstantIdx && "Cannot replace dynamic memset with insert");
      int64_t SNumBytes = cast(MSI->getLength())->getSExtValue();
      // Only lengths that are positive and fit in 32 bits produce a store;
      // for any other length the memset is simply erased below.
      if (SNumBytes > 0 && (SNumBytes >> 32) == 0) {
        unsigned NumBytes = static_cast(SNumBytes);
        unsigned Val = cast(MSI->getValue())->getZExtValue();

        // Compute the value replicated the right number of times.
        APInt APVal(NumBytes*8, Val);

        // Splat the value if non-zero.
        if (Val)
          for (unsigned i = 1; i != NumBytes; ++i)
            APVal |= APVal << 8;

        Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
        Value *New = ConvertScalar_InsertValue(
                                    ConstantInt::get(User->getContext(), APVal),
                                               Old, Offset, nullptr, Builder);
        Builder.CreateStore(New, NewAI);

        // If the load we just inserted is now dead, then the memset overwrote
        // the entire thing.
        if (Old->use_empty())
          Old->eraseFromParent();
      }
      MSI->eraseFromParent();
      continue;
    }

    // If this is a memcpy or memmove into or out of the whole allocation, we
    // can handle it like a load or store of the scalar type.
    if (MemTransferInst *MTI = dyn_cast(User)) {
      assert(Offset == 0 && "must be store to start of alloca");
      assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert");

      // If the source and destination are both to the same alloca, then this is
      // a noop copy-to-self, just delete it.  Otherwise, emit a load and store
      // as appropriate.
      AllocaInst *OrigAI = cast(GetUnderlyingObject(Ptr, DL, 0));

      if (GetUnderlyingObject(MTI->getSource(), DL, 0) != OrigAI) {
        // Dest must be OrigAI, change this to be a load from the original
        // pointer (bitcasted), then a store to our new alloca.
        assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
        Value *SrcPtr = MTI->getSource();
        PointerType* SPTy = cast(SrcPtr->getType());
        PointerType* AIPTy = cast(NewAI->getType());
        // Keep the source's address space on the pointer type we cast to.
        if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
          AIPTy = PointerType::get(AIPTy->getElementType(),
                                   SPTy->getAddressSpace());
        }
        SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy);

        LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
        SrcVal->setAlignment(MTI->getAlignment());
        Builder.CreateStore(SrcVal, NewAI);
      } else if (GetUnderlyingObject(MTI->getDest(), DL, 0) != OrigAI) {
        // Src must be OrigAI, change this to be a load from NewAI then a store
        // through the original dest pointer (bitcasted).
        assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
        LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");

        PointerType* DPTy = cast(MTI->getDest()->getType());
        PointerType* AIPTy = cast(NewAI->getType());
        // Keep the destination's address space on the pointer type we cast to.
        if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
          AIPTy = PointerType::get(AIPTy->getElementType(),
                                   DPTy->getAddressSpace());
        }
        Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy);

        StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr);
        NewStore->setAlignment(MTI->getAlignment());
      } else {
        // Noop transfer. Src == Dst
      }

      MTI->eraseFromParent();
      continue;
    }

    if (IntrinsicInst *II = dyn_cast(User)) {
      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
          II->getIntrinsicID() == Intrinsic::lifetime_end) {
        // There's no need to preserve these, as the resulting alloca will be
        // converted to a register anyways.
        II->eraseFromParent();
        continue;
      }
    }

    llvm_unreachable("Unsupported operation!");
  }
}

/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
/// or vector value FromVal, extracting the bits from the offset specified by
/// Offset.  This returns the value, which is of type ToType.
///
/// This happens when we are converting an "integer union" to a single
/// integer scalar, or when we are converting a "vector union" to a vector with
/// insert/extractelement instructions.
///
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.
///
/// NonConstantIdx, when non-null, is a variable element index. It is used
/// only when FromVal is a vector; the struct and array paths assert that it
/// is null.
Value *ConvertToScalarInfo::
ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
                           uint64_t Offset, Value* NonConstantIdx,
                           IRBuilder<> &Builder) {
  // If the load is of the whole new alloca, no conversion is needed.
  Type *FromType = FromVal->getType();
  if (FromType == ToType && Offset == 0)
    return FromVal;

  // If the result alloca is a vector type, this is either an element
  // access or a bitcast to another vector type of the same size.
  if (VectorType *VTy = dyn_cast(FromType)) {
    unsigned FromTypeSize = DL.getTypeAllocSize(FromType);
    unsigned ToTypeSize = DL.getTypeAllocSize(ToType);
    if (FromTypeSize == ToTypeSize)
        return Builder.CreateBitCast(FromVal, ToType);

    // Otherwise it must be an element access.
    unsigned Elt = 0;
    if (Offset) {
      unsigned EltSize = DL.getTypeAllocSizeInBits(VTy->getElementType());
      Elt = Offset/EltSize;
      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
    }
    // Return the element extracted out of it.
    // With a variable index, the constant element number is added to it.
    Value *Idx;
    if (NonConstantIdx) {
      if (Elt)
        Idx = Builder.CreateAdd(NonConstantIdx,
                                Builder.getInt32(Elt),
                                "dyn.offset");
      else
        Idx = NonConstantIdx;
    } else
      Idx = Builder.getInt32(Elt);
    Value *V = Builder.CreateExtractElement(FromVal, Idx);
    if (V->getType() != ToType)
      V = Builder.CreateBitCast(V, ToType);
    return V;
  }

  // If ToType is a first class aggregate, extract out each of the pieces and
  // use insertvalue's to form the FCA.
  if (StructType *ST = dyn_cast(ToType)) {
    assert(!NonConstantIdx &&
           "Dynamic indexing into struct types not supported");
    const StructLayout &Layout = *DL.getStructLayout(ST);
    Value *Res = UndefValue::get(ST);
    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
      Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
                                        Offset+Layout.getElementOffsetInBits(i),
                                              nullptr, Builder);
      Res = Builder.CreateInsertValue(Res, Elt, i);
    }
    return Res;
  }

  if (ArrayType *AT = dyn_cast(ToType)) {
    assert(!NonConstantIdx &&
           "Dynamic indexing into array types not supported");
    uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType());
    Value *Res = UndefValue::get(AT);
    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
      Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
                                              Offset+i*EltSize, nullptr,
                                              Builder);
      Res = Builder.CreateInsertValue(Res, Elt, i);
    }
    return Res;
  }

  // Otherwise, this must be a union that was converted to an integer value.
  IntegerType *NTy = cast(FromVal->getType());

  // If this is a big-endian system and the load is narrower than the
  // full alloca type, we need to do a shift to get the right bits.
  int ShAmt = 0;
  if (DL.isBigEndian()) {
    // On big-endian machines, the lowest bit is stored at the bit offset
    // from the pointer given by getTypeStoreSizeInBits.  This matters for
    // integers with a bitwidth that is not a multiple of 8.
    ShAmt = DL.getTypeStoreSizeInBits(NTy) -
            DL.getTypeStoreSizeInBits(ToType) - Offset;
  } else {
    ShAmt = Offset;
  }

  // Note: we support negative shift amounts (with shl) which are not defined.
  // We do this to support (f.e.) loads off the end of a structure where
  // only some bits are used.
  if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
    FromVal = Builder.CreateLShr(FromVal,
                                 ConstantInt::get(FromVal->getType(), ShAmt));
  else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
    FromVal = Builder.CreateShl(FromVal,
                                ConstantInt::get(FromVal->getType(), -ShAmt));

  // Finally, truncate or zero-extend the integer to the right width.
  unsigned LIBitWidth = DL.getTypeSizeInBits(ToType);
  if (LIBitWidth < NTy->getBitWidth())
    FromVal =
      Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
                                                    LIBitWidth));
  else if (LIBitWidth > NTy->getBitWidth())
    FromVal =
       Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(),
                                                    LIBitWidth));

  // If the result is an integer, this is a trunc or bitcast.
  if (ToType->isIntegerTy()) {
    // Should be done.
  } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) {
    // Just do a bitcast, we know the sizes match up.
    FromVal = Builder.CreateBitCast(FromVal, ToType);
  } else {
    // Otherwise must be a pointer.
    FromVal = Builder.CreateIntToPtr(FromVal, ToType);
  }
  assert(FromVal->getType() == ToType && "Didn't convert right?");
  return FromVal;
}

/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
/// or vector value "Old" at the offset specified by Offset.
///
/// This happens when we are converting an "integer union" to a
/// single integer scalar, or when we are converting a "vector union" to a
/// vector with insert/extractelement instructions.
///
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.
///
/// NonConstantIdx is an index value if there was a GEP with a non-constant
/// index value.
If this is 0 then all GEPs used to find this insert address /// are constant. Value *ConvertToScalarInfo:: ConvertScalar_InsertValue(Value *SV, Value *Old, uint64_t Offset, Value* NonConstantIdx, IRBuilder<> &Builder) { // Convert the stored type to the actual type, shift it left to insert // then 'or' into place. Type *AllocaType = Old->getType(); LLVMContext &Context = Old->getContext(); if (VectorType *VTy = dyn_cast(AllocaType)) { uint64_t VecSize = DL.getTypeAllocSizeInBits(VTy); uint64_t ValSize = DL.getTypeAllocSizeInBits(SV->getType()); // Changing the whole vector with memset or with an access of a different // vector type? if (ValSize == VecSize) return Builder.CreateBitCast(SV, AllocaType); // Must be an element insertion. Type *EltTy = VTy->getElementType(); if (SV->getType() != EltTy) SV = Builder.CreateBitCast(SV, EltTy); uint64_t EltSize = DL.getTypeAllocSizeInBits(EltTy); unsigned Elt = Offset/EltSize; Value *Idx; if (NonConstantIdx) { if (Elt) Idx = Builder.CreateAdd(NonConstantIdx, Builder.getInt32(Elt), "dyn.offset"); else Idx = NonConstantIdx; } else Idx = Builder.getInt32(Elt); return Builder.CreateInsertElement(Old, SV, Idx); } // If SV is a first-class aggregate value, insert each value recursively. 
if (StructType *ST = dyn_cast(SV->getType())) { assert(!NonConstantIdx && "Dynamic indexing into struct types not supported"); const StructLayout &Layout = *DL.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), nullptr, Builder); } return Old; } if (ArrayType *AT = dyn_cast(SV->getType())) { assert(!NonConstantIdx && "Dynamic indexing into array types not supported"); uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, nullptr, Builder); } return Old; } // If SV is a float, convert it to the appropriate integer type. // If it is a pointer, do the same. unsigned SrcWidth = DL.getTypeSizeInBits(SV->getType()); unsigned DestWidth = DL.getTypeSizeInBits(AllocaType); unsigned SrcStoreWidth = DL.getTypeStoreSizeInBits(SV->getType()); unsigned DestStoreWidth = DL.getTypeStoreSizeInBits(AllocaType); if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) SV = Builder.CreatePtrToInt(SV, DL.getIntPtrType(SV->getType())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { if (SV->getType()->getPrimitiveSizeInBits() < AllocaType->getPrimitiveSizeInBits()) SV = Builder.CreateZExt(SV, AllocaType); else { // Truncation may be needed if storing more than the alloca can hold // (undefined behavior). SV = Builder.CreateTrunc(SV, AllocaType); SrcWidth = DestWidth; SrcStoreWidth = DestStoreWidth; } } // If this is a big-endian system and the store is narrower than the // full alloca type, we need to do a shift to get the right bits. 
int ShAmt = 0; if (DL.isBigEndian()) { // On big-endian machines, the lowest bit is stored at the bit offset // from the pointer given by getTypeStoreSizeInBits. This matters for // integers with a bitwidth that is not a multiple of 8. ShAmt = DestStoreWidth - SrcStoreWidth - Offset; } else { ShAmt = Offset; } // Note: we support negative bitwidths (with shr) which are not defined. // We do this to support (f.e.) stores off the end of a structure where // only some bits in the structure are set. APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt)); Mask <<= ShAmt; } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt)); Mask = Mask.lshr(-ShAmt); } // Mask out the bits we are about to insert from the old value, and or // in the new bits. if (SrcWidth != DestWidth) { assert(DestWidth > SrcWidth); Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask"); SV = Builder.CreateOr(Old, SV, "ins"); } return SV; } //===----------------------------------------------------------------------===// // SRoA Driver //===----------------------------------------------------------------------===// bool SROA::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; bool Changed = performPromotion(F); while (1) { bool LocalChange = performScalarRepl(F); if (!LocalChange) break; // No need to repromote if no scalarrepl Changed = true; LocalChange = performPromotion(F); if (!LocalChange) break; // No need to re-scalarrepl if no promotion } return Changed; } namespace { class AllocaPromoter : public LoadAndStorePromoter { AllocaInst *AI; DIBuilder *DIB; SmallVector DDIs; SmallVector DVIs; public: AllocaPromoter(ArrayRef Insts, SSAUpdater &S, DIBuilder *DB) : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {} void run(AllocaInst *AI, const SmallVectorImpl &Insts) { // 
Remember which alloca we're promoting (for isInstInList). this->AI = AI; if (auto *L = LocalAsMetadata::getIfExists(AI)) { if (auto *DINode = MetadataAsValue::getIfExists(AI->getContext(), L)) { for (User *U : DINode->users()) if (DbgDeclareInst *DDI = dyn_cast(U)) DDIs.push_back(DDI); else if (DbgValueInst *DVI = dyn_cast(U)) DVIs.push_back(DVI); } } LoadAndStorePromoter::run(Insts); AI->eraseFromParent(); for (SmallVectorImpl::iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; DDI->eraseFromParent(); } for (SmallVectorImpl::iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; DVI->eraseFromParent(); } } bool isInstInList(Instruction *I, const SmallVectorImpl &Insts) const override { if (LoadInst *LI = dyn_cast(I)) return LI->getOperand(0) == AI; return cast(I)->getPointerOperand() == AI; } void updateDebugInfo(Instruction *Inst) const override { for (SmallVectorImpl::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast(Inst)) ConvertDebugDeclareToDebugValue(DDI, SI, *DIB); else if (LoadInst *LI = dyn_cast(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); } for (SmallVectorImpl::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = nullptr; if (StoreInst *SI = dyn_cast(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. 
if (ZExtInst *ZExt = dyn_cast(SI->getOperand(0))) Arg = dyn_cast(ZExt->getOperand(0)); if (SExtInst *SExt = dyn_cast(SI->getOperand(0))) Arg = dyn_cast(SExt->getOperand(0)); if (!Arg) Arg = SI->getOperand(0); } else if (LoadInst *LI = dyn_cast(Inst)) { Arg = LI->getOperand(0); } else { continue; } DIB->insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(), DVI->getExpression(), DVI->getDebugLoc(), Inst); } } }; } // end anon namespace /// isSafeSelectToSpeculate - Select instructions that use an alloca and are /// subsequently loaded can be rewritten to load both input pointers and then /// select between the result, allowing the load of the alloca to be promoted. /// From this: /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other /// %V = load i32* %P2 /// to: /// %V1 = load i32* %Alloca -> will be mem2reg'd /// %V2 = load i32* %Other /// %V = select i1 %cond, i32 %V1, i32 %V2 /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. static bool isSafeSelectToSpeculate(SelectInst *SI) { const DataLayout &DL = SI->getModule()->getDataLayout(); - bool TDerefable = isDereferenceablePointer(SI->getTrueValue(), DL); - bool FDerefable = isDereferenceablePointer(SI->getFalseValue(), DL); for (User *U : SI->users()) { LoadInst *LI = dyn_cast(U); if (!LI || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. 
- if (!TDerefable && - !isSafeToLoadUnconditionally(SI->getTrueValue(), LI->getAlignment(), + if (!isSafeToLoadUnconditionally(SI->getTrueValue(), LI->getAlignment(), LI)) return false; - if (!FDerefable && - !isSafeToLoadUnconditionally(SI->getFalseValue(), LI->getAlignment(), + if (!isSafeToLoadUnconditionally(SI->getFalseValue(), LI->getAlignment(), LI)) return false; } return true; } /// isSafePHIToSpeculate - PHI instructions that use an alloca and are /// subsequently loaded can be rewritten to load both input pointers in the pred /// blocks and then PHI the results, allowing the load of the alloca to be /// promoted. /// From this: /// %P2 = phi [i32* %Alloca, i32* %Other] /// %V = load i32* %P2 /// to: /// %V1 = load i32* %Alloca -> will be mem2reg'd /// ... /// %V2 = load i32* %Other /// ... /// %V = phi [i32 %V1, i32 %V2] /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. static bool isSafePHIToSpeculate(PHINode *PN) { // For now, we can only do this promotion if the load is in the same block as // the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. // TODO: Allow stores. BasicBlock *BB = PN->getParent(); unsigned MaxAlign = 0; for (User *U : PN->users()) { LoadInst *LI = dyn_cast(U); if (!LI || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. if (LI->getParent() != BB) return false; // Ensure that there are no instructions between the PHI and the load that // could store. for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; MaxAlign = std::max(MaxAlign, LI->getAlignment()); } const DataLayout &DL = PN->getModule()->getDataLayout(); // Okay, we know that we have one or more loads in the same block as the PHI. 
// We can transform this if it is safe to push the loads into the predecessor // blocks. The only thing to watch out for is that we can't put a possibly // trapping load in the predecessor if it is a critical edge. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); Value *InVal = PN->getIncomingValue(i); // If the terminator of the predecessor has side-effects (an invoke), // there is no safe place to put a load in the predecessor. if (Pred->getTerminator()->mayHaveSideEffects()) return false; // If the value is produced by the terminator of the predecessor // (an invoke), there is no valid place to put a load in the predecessor. if (Pred->getTerminator() == InVal) return false; // If the predecessor has a single successor, then the edge isn't critical. if (Pred->getTerminator()->getNumSuccessors() == 1) continue; // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. - if (isDereferenceablePointer(InVal, DL) || - isSafeToLoadUnconditionally(InVal, MaxAlign, Pred->getTerminator())) + if (isSafeToLoadUnconditionally(InVal, MaxAlign, Pred->getTerminator())) continue; return false; } return true; } /// tryToMakeAllocaBePromotable - This returns true if the alloca only has /// direct (non-volatile) loads and stores to it. If the alloca is close but /// not quite there, this will transform the code to allow promotion. As such, /// it is a non-pure predicate. static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout &DL) { SetVector, SmallPtrSet > InstsToRewrite; for (User *U : AI->users()) { if (LoadInst *LI = dyn_cast(U)) { if (!LI->isSimple()) return false; continue; } if (StoreInst *SI = dyn_cast(U)) { if (SI->getOperand(0) == AI || !SI->isSimple()) return false; // Don't allow a store OF the AI, only INTO the AI. 
continue; } if (SelectInst *SI = dyn_cast(U)) { // If the condition being selected on is a constant, fold the select, yes // this does (rarely) happen early on. if (ConstantInt *CI = dyn_cast(SI->getCondition())) { Value *Result = SI->getOperand(1+CI->isZero()); SI->replaceAllUsesWith(Result); SI->eraseFromParent(); // This is very rare and we just scrambled the use list of AI, start // over completely. return tryToMakeAllocaBePromotable(AI, DL); } // If it is safe to turn "load (select c, AI, ptr)" into a select of two // loads, then we can transform this by rewriting the select. if (!isSafeSelectToSpeculate(SI)) return false; InstsToRewrite.insert(SI); continue; } if (PHINode *PN = dyn_cast(U)) { if (PN->use_empty()) { // Dead PHIs can be stripped. InstsToRewrite.insert(PN); continue; } // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads // in the pred blocks, then we can transform this by rewriting the PHI. if (!isSafePHIToSpeculate(PN)) return false; InstsToRewrite.insert(PN); continue; } if (BitCastInst *BCI = dyn_cast(U)) { if (onlyUsedByLifetimeMarkers(BCI)) { InstsToRewrite.insert(BCI); continue; } } return false; } // If there are no instructions to rewrite, then all uses are load/stores and // we're done! if (InstsToRewrite.empty()) return true; // If we have instructions that need to be rewritten for this to be promotable // take care of it now. for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { if (BitCastInst *BCI = dyn_cast(InstsToRewrite[i])) { // This could only be a bitcast used by nothing but lifetime intrinsics. for (BitCastInst::user_iterator I = BCI->user_begin(), E = BCI->user_end(); I != E;) cast(*I++)->eraseFromParent(); BCI->eraseFromParent(); continue; } if (SelectInst *SI = dyn_cast(InstsToRewrite[i])) { // Selects in InstsToRewrite only have load uses. Rewrite each as two // loads with a new select. 
while (!SI->use_empty()) { LoadInst *LI = cast(SI->user_back()); IRBuilder<> Builder(LI); LoadInst *TrueLoad = Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t"); LoadInst *FalseLoad = Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f"); // Transfer alignment and AA info if present. TrueLoad->setAlignment(LI->getAlignment()); FalseLoad->setAlignment(LI->getAlignment()); AAMDNodes Tags; LI->getAAMetadata(Tags); if (Tags) { TrueLoad->setAAMetadata(Tags); FalseLoad->setAAMetadata(Tags); } Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad); V->takeName(LI); LI->replaceAllUsesWith(V); LI->eraseFromParent(); } // Now that all the loads are gone, the select is gone too. SI->eraseFromParent(); continue; } // Otherwise, we have a PHI node which allows us to push the loads into the // predecessors. PHINode *PN = cast(InstsToRewrite[i]); if (PN->use_empty()) { PN->eraseFromParent(); continue; } Type *LoadTy = cast(PN->getType())->getElementType(); PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), PN->getName()+".ld", PN); // Get the AA tags and alignment to use from one of the loads. It doesn't // matter which one we get and if any differ, it doesn't matter. LoadInst *SomeLoad = cast(PN->user_back()); AAMDNodes AATags; SomeLoad->getAAMetadata(AATags); unsigned Align = SomeLoad->getAlignment(); // Rewrite all loads of the PN to use the new PHI. while (!PN->use_empty()) { LoadInst *LI = cast(PN->user_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } // Inject loads into all of the pred blocks. Keep track of which blocks we // insert them into in case we have multiple edges from the same block. DenseMap InsertedLoads; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); LoadInst *&Load = InsertedLoads[Pred]; if (!Load) { Load = new LoadInst(PN->getIncomingValue(i), PN->getName() + "." 
+ Pred->getName(), Pred->getTerminator()); Load->setAlignment(Align); if (AATags) Load->setAAMetadata(AATags); } NewPN->addIncoming(Load, Pred); } PN->eraseFromParent(); } ++NumAdjusted; return true; } bool SROA::performPromotion(Function &F) { std::vector Allocas; const DataLayout &DL = F.getParent()->getDataLayout(); DominatorTree *DT = nullptr; if (HasDomTree) DT = &getAnalysis().getDomTree(); AssumptionCache &AC = getAnalysis().getAssumptionCache(F); BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); bool Changed = false; SmallVector Insts; while (1) { Allocas.clear(); // Find allocas that are safe to promote, by looking at all instructions in // the entry node for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) if (AllocaInst *AI = dyn_cast(I)) // Is it an alloca? if (tryToMakeAllocaBePromotable(AI, DL)) Allocas.push_back(AI); if (Allocas.empty()) break; if (HasDomTree) PromoteMemToReg(Allocas, *DT, nullptr, &AC); else { SSAUpdater SSA; for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { AllocaInst *AI = Allocas[i]; // Build list of instructions to promote. for (User *U : AI->users()) Insts.push_back(cast(U)); AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts); Insts.clear(); } } NumPromoted += Allocas.size(); Changed = true; } return Changed; } /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for /// SROA. It must be a struct or array type with a small number of elements. bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { Type *T = AI->getAllocatedType(); // Do not promote any struct that has too many members. if (StructType *ST = dyn_cast(T)) return ST->getNumElements() <= StructMemberThreshold; // Do not promote any array that has too many elements. 
if (ArrayType *AT = dyn_cast(T)) return AT->getNumElements() <= ArrayElementThreshold; return false; } // performScalarRepl - This algorithm is a simple worklist driven algorithm, // which runs on all of the alloca instructions in the entry block, removing // them if they are only used by getelementptr instructions. // bool SROA::performScalarRepl(Function &F) { std::vector WorkList; const DataLayout &DL = F.getParent()->getDataLayout(); // Scan the entry basic block, adding allocas to the worklist. BasicBlock &BB = F.getEntryBlock(); for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) if (AllocaInst *A = dyn_cast(I)) WorkList.push_back(A); // Process the worklist bool Changed = false; while (!WorkList.empty()) { AllocaInst *AI = WorkList.back(); WorkList.pop_back(); // Handle dead allocas trivially. These can be formed by SROA'ing arrays // with unused elements. if (AI->use_empty()) { AI->eraseFromParent(); Changed = true; continue; } // If this alloca is impossible for us to promote, reject it early. if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized()) continue; // Check to see if we can perform the core SROA transformation. We cannot // transform the allocation instruction if it is an array allocation // (allocations OF arrays are ok though), and an allocation of a scalar // value cannot be decomposed at all. uint64_t AllocaSize = DL.getTypeAllocSize(AI->getAllocatedType()); // Do not promote [0 x %struct]. if (AllocaSize == 0) continue; // Do not promote any struct whose size is too big. if (AllocaSize > SRThreshold) continue; // If the alloca looks like a good candidate for scalar replacement, and if // all its users can be transformed, then split up the aggregate into its // separate elements. 
if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) { DoScalarReplacement(AI, WorkList); Changed = true; continue; } // If we can turn this aggregate value (potentially with casts) into a // simple scalar value that can be mem2reg'd into a register value. // IsNotTrivial tracks whether this is something that mem2reg could have // promoted itself. If so, we don't want to transform it needlessly. Note // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. if (AllocaInst *NewAI = ConvertToScalarInfo((unsigned)AllocaSize, DL, ScalarLoadThreshold) .TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; Changed = true; continue; } // Otherwise, couldn't process this alloca. } return Changed; } /// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl /// predicate, do SROA now. void SROA::DoScalarReplacement(AllocaInst *AI, std::vector &WorkList) { DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n'); SmallVector ElementAllocas; if (StructType *ST = dyn_cast(AI->getAllocatedType())) { ElementAllocas.reserve(ST->getNumContainedTypes()); for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { AllocaInst *NA = new AllocaInst(ST->getContainedType(i), nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); WorkList.push_back(NA); // Add to worklist for recursive processing } } else { ArrayType *AT = cast(AI->getAllocatedType()); ElementAllocas.reserve(AT->getNumElements()); Type *ElTy = AT->getElementType(); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { AllocaInst *NA = new AllocaInst(ElTy, nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); WorkList.push_back(NA); // Add to worklist for recursive processing } } // Now that we have created the new alloca instructions, rewrite all the // uses of the old alloca. 
RewriteForScalarRepl(AI, AI, 0, ElementAllocas); // Now erase any instructions that were made dead while rewriting the alloca. DeleteDeadInstructions(); AI->eraseFromParent(); ++NumReplaced; } /// DeleteDeadInstructions - Erase instructions on the DeadInstrs list, /// recursively including all their operands that become trivially dead. void SROA::DeleteDeadInstructions() { while (!DeadInsts.empty()) { Instruction *I = cast(DeadInsts.pop_back_val()); for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) if (Instruction *U = dyn_cast(*OI)) { // Zero out the operand and see if it becomes trivially dead. // (But, don't add allocas to the dead instruction list -- they are // already on the worklist and will be deleted separately.) *OI = nullptr; if (isInstructionTriviallyDead(U) && !isa(U)) DeadInsts.push_back(U); } I->eraseFromParent(); } } /// isSafeForScalarRepl - Check if instruction I is a safe use with regard to /// performing scalar replacement of alloca AI. The results are flagged in /// the Info parameter. Offset indicates the position within AI that is /// referenced by this instruction. 
void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { const DataLayout &DL = I->getModule()->getDataLayout(); for (Use &U : I->uses()) { Instruction *User = cast(U.getUser()); if (BitCastInst *BC = dyn_cast(User)) { isSafeForScalarRepl(BC, Offset, Info); } else if (GetElementPtrInst *GEPI = dyn_cast(User)) { uint64_t GEPOffset = Offset; isSafeGEP(GEPI, GEPOffset, Info); if (!Info.isUnsafe) isSafeForScalarRepl(GEPI, GEPOffset, Info); } else if (MemIntrinsic *MI = dyn_cast(User)) { ConstantInt *Length = dyn_cast(MI->getLength()); if (!Length || Length->isNegative()) return MarkUnsafe(Info, User); isSafeMemAccess(Offset, Length->getZExtValue(), nullptr, U.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); } else if (LoadInst *LI = dyn_cast(User)) { if (!LI->isSimple()) return MarkUnsafe(Info, User); Type *LIType = LI->getType(); isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast(User)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (IntrinsicInst *II = dyn_cast(User)) { if (II->getIntrinsicID() != Intrinsic::lifetime_start && II->getIntrinsicID() != Intrinsic::lifetime_end) return MarkUnsafe(Info, User); } else if (isa(User) || isa(User)) { isSafePHISelectUseForScalarRepl(User, Offset, Info); } else { return MarkUnsafe(Info, User); } if (Info.isUnsafe) return; } } /// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer /// derived from the alloca, we can often still split the alloca into elements. 
/// This is useful if we have a large alloca where one element is phi'd /// together somewhere: we can SRoA and promote all the other elements even if /// we end up not being able to promote this one. /// /// All we require is that the uses of the PHI do not index into other parts of /// the alloca. The most important use case for this is single load and stores /// that are PHI'd together, which can happen due to code sinking. void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { // If we've already checked this PHI, don't do it again. if (PHINode *PN = dyn_cast(I)) if (!Info.CheckedPHIs.insert(PN).second) return; const DataLayout &DL = I->getModule()->getDataLayout(); for (User *U : I->users()) { Instruction *UI = cast(U); if (BitCastInst *BC = dyn_cast(UI)) { isSafePHISelectUseForScalarRepl(BC, Offset, Info); } else if (GetElementPtrInst *GEPI = dyn_cast(UI)) { // Only allow "bitcast" GEPs for simplicity. We could generalize this, // but would have to prove that we're staying inside of an element being // promoted. 
if (!GEPI->hasAllZeroIndices()) return MarkUnsafe(Info, UI); isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); } else if (LoadInst *LI = dyn_cast(UI)) { if (!LI->isSimple()) return MarkUnsafe(Info, UI); Type *LIType = LI->getType(); isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info, LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast(UI)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, UI); Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info, SI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (isa(UI) || isa(UI)) { isSafePHISelectUseForScalarRepl(UI, Offset, Info); } else { return MarkUnsafe(Info, UI); } if (Info.isUnsafe) return; } } /// isSafeGEP - Check if a GEP instruction can be handled for scalar /// replacement. It is safe when all the indices are constant, in-bounds /// references, and when the resulting offset corresponds to an element within /// the alloca type. The results are flagged in the Info parameter. Upon /// return, Offset is adjusted as specified by the GEP indices. void SROA::isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info) { gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI); if (GEPIt == E) return; bool NonConstant = false; unsigned NonConstantIdxSize = 0; // Walk through the GEP type indices, checking the types that this indexes // into. for (; GEPIt != E; ++GEPIt) { // Ignore struct elements, no extra checking needed for these. if ((*GEPIt)->isStructTy()) continue; ConstantInt *IdxVal = dyn_cast(GEPIt.getOperand()); if (!IdxVal) return MarkUnsafe(Info, GEPI); } // Compute the offset due to this GEP and check if the alloca has a // component element at that offset. 
SmallVector Indices(GEPI->op_begin() + 1, GEPI->op_end()); // If this GEP is non-constant then the last operand must have been a // dynamic index into a vector. Pop this now as it has no impact on the // constant part of the offset. if (NonConstant) Indices.pop_back(); const DataLayout &DL = GEPI->getModule()->getDataLayout(); Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize, DL)) MarkUnsafe(Info, GEPI); } /// isHomogeneousAggregate - Check if type T is a struct or array containing /// elements of the same type (which is always true for arrays). If so, /// return true with NumElts and EltTy set to the number of elements and the /// element type, respectively. static bool isHomogeneousAggregate(Type *T, unsigned &NumElts, Type *&EltTy) { if (ArrayType *AT = dyn_cast(T)) { NumElts = AT->getNumElements(); EltTy = (NumElts == 0 ? nullptr : AT->getElementType()); return true; } if (StructType *ST = dyn_cast(T)) { NumElts = ST->getNumContainedTypes(); EltTy = (NumElts == 0 ? nullptr : ST->getContainedType(0)); for (unsigned n = 1; n < NumElts; ++n) { if (ST->getContainedType(n) != EltTy) return false; } return true; } return false; } /// isCompatibleAggregate - Check if T1 and T2 are either the same type or are /// "homogeneous" aggregates with the same element type and number of elements. static bool isCompatibleAggregate(Type *T1, Type *T2) { if (T1 == T2) return true; unsigned NumElts1, NumElts2; Type *EltTy1, *EltTy2; if (isHomogeneousAggregate(T1, NumElts1, EltTy1) && isHomogeneousAggregate(T2, NumElts2, EltTy2) && NumElts1 == NumElts2 && EltTy1 == EltTy2) return true; return false; } /// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI /// alloca or has an offset and size that corresponds to a component element /// within it. The offset checked here may have been formed from a GEP with a /// pointer bitcasted to a different type. 
/// /// If AllowWholeAccess is true, then this allows uses of the entire alloca as a /// unit. If false, it only allows accesses known to be in a single element. void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess) { const DataLayout &DL = TheAccess->getModule()->getDataLayout(); // Check if this is a load/store of the entire alloca. if (Offset == 0 && AllowWholeAccess && MemSize == DL.getTypeAllocSize(Info.AI->getAllocatedType())) { // This can be safe for MemIntrinsics (where MemOpType is 0) and integer // loads/stores (which are essentially the same as the MemIntrinsics with // regard to copying padding between elements). But, if an alloca is // flagged as both a source and destination of such operations, we'll need // to check later for padding between elements. if (!MemOpType || MemOpType->isIntegerTy()) { if (isStore) Info.isMemCpyDst = true; else Info.isMemCpySrc = true; return; } // This is also safe for references using a type that is compatible with // the type of the alloca, so that loads/stores can be rewritten using // insertvalue/extractvalue. if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) { Info.hasSubelementAccess = true; return; } } // Check if the offset/size correspond to a component within the alloca type. Type *T = Info.AI->getAllocatedType(); if (TypeHasComponent(T, Offset, MemSize, DL)) { Info.hasSubelementAccess = true; return; } return MarkUnsafe(Info, TheAccess); } /// TypeHasComponent - Return true if T has a component type with the /// specified offset and size. If Size is zero, do not check the size. 
bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size, const DataLayout &DL) { Type *EltTy; uint64_t EltSize; if (StructType *ST = dyn_cast(T)) { const StructLayout *Layout = DL.getStructLayout(ST); unsigned EltIdx = Layout->getElementContainingOffset(Offset); EltTy = ST->getContainedType(EltIdx); EltSize = DL.getTypeAllocSize(EltTy); Offset -= Layout->getElementOffset(EltIdx); } else if (ArrayType *AT = dyn_cast(T)) { EltTy = AT->getElementType(); EltSize = DL.getTypeAllocSize(EltTy); if (Offset >= AT->getNumElements() * EltSize) return false; Offset %= EltSize; } else if (VectorType *VT = dyn_cast(T)) { EltTy = VT->getElementType(); EltSize = DL.getTypeAllocSize(EltTy); if (Offset >= VT->getNumElements() * EltSize) return false; Offset %= EltSize; } else { return false; } if (Offset == 0 && (Size == 0 || EltSize == Size)) return true; // Check if the component spans multiple elements. if (Offset + Size > EltSize) return false; return TypeHasComponent(EltTy, Offset, Size, DL); } /// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite /// the instruction I, which references it, to use the separate elements. /// Offset indicates the position within AI that is referenced by this /// instruction. 
void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts) { const DataLayout &DL = I->getModule()->getDataLayout(); for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { Use &TheUse = *UI++; Instruction *User = cast(TheUse.getUser()); if (BitCastInst *BC = dyn_cast(User)) { RewriteBitCast(BC, AI, Offset, NewElts); continue; } if (GetElementPtrInst *GEPI = dyn_cast(User)) { RewriteGEP(GEPI, AI, Offset, NewElts); continue; } if (MemIntrinsic *MI = dyn_cast(User)) { ConstantInt *Length = dyn_cast(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); if (Offset == 0 && MemSize == DL.getTypeAllocSize(AI->getAllocatedType())) RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); // Otherwise the intrinsic can only touch a single element and the // address operand will be updated, so nothing else needs to be done. continue; } if (IntrinsicInst *II = dyn_cast(User)) { if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end) { RewriteLifetimeIntrinsic(II, AI, Offset, NewElts); } continue; } if (LoadInst *LI = dyn_cast(User)) { Type *LIType = LI->getType(); if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { // Replace: // %res = load { i32, i32 }* %alloc // with: // %load.0 = load i32* %alloc.0 // %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0 // %load.1 = load i32* %alloc.1 // %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1 // (Also works for arrays instead of structs) Value *Insert = UndefValue::get(LIType); IRBuilder<> Builder(LI); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { Value *Load = Builder.CreateLoad(NewElts[i], "load"); Insert = Builder.CreateInsertValue(Insert, Load, i, "insert"); } LI->replaceAllUsesWith(Insert); DeadInsts.push_back(LI); } else if (LIType->isIntegerTy() && DL.getTypeAllocSize(LIType) == DL.getTypeAllocSize(AI->getAllocatedType())) { // If this is a load of the entire 
alloca to an integer, rewrite it. RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); } continue; } if (StoreInst *SI = dyn_cast(User)) { Value *Val = SI->getOperand(0); Type *SIType = Val->getType(); if (isCompatibleAggregate(SIType, AI->getAllocatedType())) { // Replace: // store { i32, i32 } %val, { i32, i32 }* %alloc // with: // %val.0 = extractvalue { i32, i32 } %val, 0 // store i32 %val.0, i32* %alloc.0 // %val.1 = extractvalue { i32, i32 } %val, 1 // store i32 %val.1, i32* %alloc.1 // (Also works for arrays instead of structs) IRBuilder<> Builder(SI); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName()); Builder.CreateStore(Extract, NewElts[i]); } DeadInsts.push_back(SI); } else if (SIType->isIntegerTy() && DL.getTypeAllocSize(SIType) == DL.getTypeAllocSize(AI->getAllocatedType())) { // If this is a store of the entire alloca from an integer, rewrite it. RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); } continue; } if (isa(User) || isa(User)) { // If we have a PHI user of the alloca itself (as opposed to a GEP or // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to // the new pointer. if (!isa(I)) continue; assert(Offset == 0 && NewElts[0] && "Direct alloca use should have a zero offset"); // If we have a use of the alloca, we know the derived uses will be // utilizing just the first element of the scalarized result. Insert a // bitcast of the first alloca before the user as required. AllocaInst *NewAI = NewElts[0]; BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI); NewAI->moveBefore(BCI); TheUse = BCI; continue; } } } /// RewriteBitCast - Update a bitcast reference to the alloca being replaced /// and recursively continue updating all of its uses. 
void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts) { RewriteForScalarRepl(BC, AI, Offset, NewElts); if (BC->getOperand(0) != AI) return; // The bitcast references the original alloca. Replace its uses with // references to the alloca containing offset zero (which is normally at // index zero, but might not be in cases involving structs with elements // of size zero). Type *T = AI->getAllocatedType(); uint64_t EltOffset = 0; Type *IdxTy; uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, BC->getModule()->getDataLayout()); Instruction *Val = NewElts[Idx]; if (Val->getType() != BC->getDestTy()) { Val = new BitCastInst(Val, BC->getDestTy(), "", BC); Val->takeName(BC); } BC->replaceAllUsesWith(Val); DeadInsts.push_back(BC); } /// FindElementAndOffset - Return the index of the element containing Offset /// within the specified type, which must be either a struct or an array. /// Sets T to the type of the element and Offset to the offset within that /// element. IdxTy is set to the type of the index result to be used in a /// GEP instruction. 
uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy, const DataLayout &DL) { uint64_t Idx = 0; if (StructType *ST = dyn_cast(T)) { const StructLayout *Layout = DL.getStructLayout(ST); Idx = Layout->getElementContainingOffset(Offset); T = ST->getContainedType(Idx); Offset -= Layout->getElementOffset(Idx); IdxTy = Type::getInt32Ty(T->getContext()); return Idx; } else if (ArrayType *AT = dyn_cast(T)) { T = AT->getElementType(); uint64_t EltSize = DL.getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); return Idx; } VectorType *VT = cast(T); T = VT->getElementType(); uint64_t EltSize = DL.getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; IdxTy = Type::getInt64Ty(T->getContext()); return Idx; } /// RewriteGEP - Check if this GEP instruction moves the pointer across /// elements of the alloca that are being split apart, and if so, rewrite /// the GEP to be relative to the new element. void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts) { uint64_t OldOffset = Offset; const DataLayout &DL = GEPI->getModule()->getDataLayout(); SmallVector Indices(GEPI->op_begin() + 1, GEPI->op_end()); // If the GEP was dynamic then it must have been a dynamic vector lookup. // In this case, it must be the last GEP operand which is dynamic so keep that // aside until we've found the constant GEP offset then add it back in at the // end. Value* NonConstantIdx = nullptr; if (!GEPI->hasAllConstantIndices()) NonConstantIdx = Indices.pop_back_val(); Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); Type *T = AI->getAllocatedType(); Type *IdxTy; uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy, DL); if (GEPI->getOperand(0) == AI) OldIdx = ~0ULL; // Force the GEP to be rewritten. 
T = AI->getAllocatedType(); uint64_t EltOffset = Offset; uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, DL); // If this GEP does not move the pointer across elements of the alloca // being split, then it does not needs to be rewritten. if (Idx == OldIdx) return; Type *i32Ty = Type::getInt32Ty(AI->getContext()); SmallVector NewArgs; NewArgs.push_back(Constant::getNullValue(i32Ty)); while (EltOffset != 0) { uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy, DL); NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx)); } if (NonConstantIdx) { Type* GepTy = T; // This GEP has a dynamic index. We need to add "i32 0" to index through // any structs or arrays in the original type until we get to the vector // to index. while (!isa(GepTy)) { NewArgs.push_back(Constant::getNullValue(i32Ty)); GepTy = cast(GepTy)->getTypeAtIndex(0U); } NewArgs.push_back(NonConstantIdx); } Instruction *Val = NewElts[Idx]; if (NewArgs.size() > 1) { Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); Val->takeName(GEPI); } if (Val->getType() != GEPI->getType()) Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI); GEPI->replaceAllUsesWith(Val); DeadInsts.push_back(GEPI); } /// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it /// to mark the lifetime of the scalarized memory. void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, uint64_t Offset, SmallVectorImpl &NewElts) { ConstantInt *OldSize = cast(II->getArgOperand(0)); // Put matching lifetime markers on everything from Offset up to // Offset+OldSize. Type *AIType = AI->getAllocatedType(); const DataLayout &DL = II->getModule()->getDataLayout(); uint64_t NewOffset = Offset; Type *IdxTy; uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy, DL); IRBuilder<> Builder(II); uint64_t Size = OldSize->getLimitedValue(); if (NewOffset) { // Splice the first element and index 'NewOffset' bytes in. SROA will // split the alloca again later. 
unsigned AS = AI->getType()->getAddressSpace(); Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS)); V = Builder.CreateGEP(Builder.getInt8Ty(), V, Builder.getInt64(NewOffset)); IdxTy = NewElts[Idx]->getAllocatedType(); uint64_t EltSize = DL.getTypeAllocSize(IdxTy) - NewOffset; if (EltSize > Size) { EltSize = Size; Size = 0; } else { Size -= EltSize; } if (II->getIntrinsicID() == Intrinsic::lifetime_start) Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize)); else Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize)); ++Idx; } for (; Idx != NewElts.size() && Size; ++Idx) { IdxTy = NewElts[Idx]->getAllocatedType(); uint64_t EltSize = DL.getTypeAllocSize(IdxTy); if (EltSize > Size) { EltSize = Size; Size = 0; } else { Size -= EltSize; } if (II->getIntrinsicID() == Intrinsic::lifetime_start) Builder.CreateLifetimeStart(NewElts[Idx], Builder.getInt64(EltSize)); else Builder.CreateLifetimeEnd(NewElts[Idx], Builder.getInt64(EltSize)); } DeadInsts.push_back(II); } /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, SmallVectorImpl &NewElts) { // If this is a memcpy/memmove, construct the other pointer as the // appropriate type. The "Other" pointer is the pointer that goes to memory // that doesn't have anything to do with the alloca that we are promoting. For // memset, this Value* stays null. Value *OtherPtr = nullptr; unsigned MemAlignment = MI->getAlignment(); if (MemTransferInst *MTI = dyn_cast(MI)) { // memmove/memcopy if (Inst == MTI->getRawDest()) OtherPtr = MTI->getRawSource(); else { assert(Inst == MTI->getRawSource()); OtherPtr = MTI->getRawDest(); } } // If there is an other pointer, we want to convert it to the same pointer // type as AI has, so we can GEP through it safely. 
if (OtherPtr) { unsigned AddrSpace = cast(OtherPtr->getType())->getAddressSpace(); // Remove bitcasts and all-zero GEPs from OtherPtr. This is an // optimization, but it's also required to detect the corner case where // both pointer operands are referencing the same memory, and where // OtherPtr may be a bitcast or GEP that currently being rewritten. (This // function is only called for mem intrinsics that access the whole // aggregate, so non-zero GEPs are not an issue here.) OtherPtr = OtherPtr->stripPointerCasts(); // Copying the alloca to itself is a no-op: just delete it. if (OtherPtr == AI || OtherPtr == NewElts[0]) { // This code will run twice for a no-op memcpy -- once for each operand. // Put only one reference to MI on the DeadInsts list. for (SmallVectorImpl::const_iterator I = DeadInsts.begin(), E = DeadInsts.end(); I != E; ++I) if (*I == MI) return; DeadInsts.push_back(MI); return; } // If the pointer is not the right type, insert a bitcast to the right // type. Type *NewTy = PointerType::get(AI->getType()->getElementType(), AddrSpace); if (OtherPtr->getType() != NewTy) OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI); } // Process each element of the aggregate. bool SROADest = MI->getRawDest() == Inst; Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext())); const DataLayout &DL = MI->getModule()->getDataLayout(); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // If this is a memcpy/memmove, emit a GEP of the other element address. 
Value *OtherElt = nullptr; unsigned OtherEltAlign = MemAlignment; if (OtherPtr) { Value *Idx[2] = { Zero, ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, OtherPtr->getName()+"."+Twine(i), MI); uint64_t EltOffset; PointerType *OtherPtrTy = cast(OtherPtr->getType()); Type *OtherTy = OtherPtrTy->getElementType(); if (StructType *ST = dyn_cast(OtherTy)) { EltOffset = DL.getStructLayout(ST)->getElementOffset(i); } else { Type *EltTy = cast(OtherTy)->getElementType(); EltOffset = DL.getTypeAllocSize(EltTy) * i; } // The alignment of the other pointer is the guaranteed alignment of the // element, which is affected by both the known alignment of the whole // mem intrinsic and the alignment of the element. If the alignment of // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the // known alignment is just 4 bytes. OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset); } Value *EltPtr = NewElts[i]; Type *EltTy = cast(EltPtr->getType())->getElementType(); // If we got down to a scalar, insert a load or store as appropriate. if (EltTy->isSingleValueType()) { if (isa(MI)) { if (SROADest) { // From Other to Alloca. Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI); new StoreInst(Elt, EltPtr, MI); } else { // From Alloca to Other. Value *Elt = new LoadInst(EltPtr, "tmp", MI); new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI); } continue; } assert(isa(MI)); // If the stored element is zero (common case), just store a null // constant. Constant *StoreVal; if (ConstantInt *CI = dyn_cast(MI->getArgOperand(1))) { if (CI->isZero()) { StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0> } else { // If EltTy is a vector type, get the element type. Type *ValTy = EltTy->getScalarType(); // Construct an integer with the right value. 
unsigned EltSize = DL.getTypeSizeInBits(ValTy); APInt OneVal(EltSize, CI->getZExtValue()); APInt TotalVal(OneVal); // Set each byte. for (unsigned i = 0; 8*i < EltSize; ++i) { TotalVal = TotalVal.shl(8); TotalVal |= OneVal; } // Convert the integer value to the appropriate type. StoreVal = ConstantInt::get(CI->getContext(), TotalVal); if (ValTy->isPointerTy()) StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy); else if (ValTy->isFloatingPointTy()) StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy); assert(StoreVal->getType() == ValTy && "Type mismatch!"); // If the requested value was a vector constant, create it. if (EltTy->isVectorTy()) { unsigned NumElts = cast(EltTy)->getNumElements(); StoreVal = ConstantVector::getSplat(NumElts, StoreVal); } } new StoreInst(StoreVal, EltPtr, MI); continue; } // Otherwise, if we're storing a byte variable, use a memset call for // this element. } unsigned EltSize = DL.getTypeAllocSize(EltTy); if (!EltSize) continue; IRBuilder<> Builder(MI); // Finally, insert the meminst for this element. if (isa(MI)) { Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize, MI->isVolatile()); } else { assert(isa(MI)); Value *Dst = SROADest ? EltPtr : OtherElt; // Dest ptr Value *Src = SROADest ? OtherElt : EltPtr; // Src ptr if (isa(MI)) Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile()); else Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile()); } } DeadInsts.push_back(MI); } /// RewriteStoreUserOfWholeAlloca - We found a store of an integer that /// overwrites the entire allocation. Extract out the pieces of the stored /// integer and store them individually. void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, SmallVectorImpl &NewElts) { // Extract each element out of the integer according to its structure offset // and store the element value to the individual alloca. 
Value *SrcVal = SI->getOperand(0); Type *AllocaEltTy = AI->getAllocatedType(); const DataLayout &DL = SI->getModule()->getDataLayout(); uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); // Handle tail padding by extending the operand if (DL.getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = Builder.CreateZExt(SrcVal, IntegerType::get(SI->getContext(), AllocaSizeBits)); DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI << '\n'); // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. if (StructType *EltSTy = dyn_cast(AllocaEltTy)) { const StructLayout *Layout = DL.getStructLayout(EltSTy); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Get the number of bits to shift SrcVal to get the value. Type *FieldTy = EltSTy->getElementType(i); uint64_t Shift = Layout->getElementOffsetInBits(i); if (DL.isBigEndian()) Shift = AllocaSizeBits - Shift - DL.getTypeAllocSizeInBits(FieldTy); Value *EltVal = SrcVal; if (Shift) { Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); } // Truncate down to an integer of the right size. uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; if (FieldSizeBits != AllocaSizeBits) EltVal = Builder.CreateTrunc(EltVal, IntegerType::get(SI->getContext(), FieldSizeBits)); Value *DestField = NewElts[i]; if (EltVal->getType() == FieldTy) { // Storing to an integer field of this size, just do it. } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) { // Bitcast to the right element type (for fp/vector values). EltVal = Builder.CreateBitCast(EltVal, FieldTy); } else { // Otherwise, bitcast the dest pointer (for aggregates). 
DestField = Builder.CreateBitCast(DestField, PointerType::getUnqual(EltVal->getType())); } new StoreInst(EltVal, DestField, SI); } } else { ArrayType *ATy = cast(AllocaEltTy); Type *ArrayEltTy = ATy->getElementType(); uint64_t ElementOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); uint64_t ElementSizeBits = DL.getTypeSizeInBits(ArrayEltTy); uint64_t Shift; if (DL.isBigEndian()) Shift = AllocaSizeBits-ElementOffset; else Shift = 0; for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Ignore zero sized fields like {}, they obviously contain no data. if (ElementSizeBits == 0) continue; Value *EltVal = SrcVal; if (Shift) { Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); } // Truncate down to an integer of the right size. if (ElementSizeBits != AllocaSizeBits) EltVal = Builder.CreateTrunc(EltVal, IntegerType::get(SI->getContext(), ElementSizeBits)); Value *DestField = NewElts[i]; if (EltVal->getType() == ArrayEltTy) { // Storing to an integer field of this size, just do it. } else if (ArrayEltTy->isFloatingPointTy() || ArrayEltTy->isVectorTy()) { // Bitcast to the right element type (for fp/vector values). EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy); } else { // Otherwise, bitcast the dest pointer (for aggregates). DestField = Builder.CreateBitCast(DestField, PointerType::getUnqual(EltVal->getType())); } new StoreInst(EltVal, DestField, SI); if (DL.isBigEndian()) Shift -= ElementOffset; else Shift += ElementOffset; } } DeadInsts.push_back(SI); } /// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to /// an integer. Load the individual pieces to form the aggregate value. void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVectorImpl &NewElts) { // Extract each element out of the NewElts according to its structure offset // and form the result value. 
Type *AllocaEltTy = AI->getAllocatedType(); const DataLayout &DL = LI->getModule()->getDataLayout(); uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy); DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI << '\n'); // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. const StructLayout *Layout = nullptr; uint64_t ArrayEltBitOffset = 0; if (StructType *EltSTy = dyn_cast(AllocaEltTy)) { Layout = DL.getStructLayout(EltSTy); } else { Type *ArrayEltTy = cast(AllocaEltTy)->getElementType(); ArrayEltBitOffset = DL.getTypeAllocSizeInBits(ArrayEltTy); } Value *ResultVal = Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits)); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Load the value from the alloca. If the NewElt is an aggregate, cast // the pointer to an integer of the same size before doing the load. Value *SrcField = NewElts[i]; Type *FieldTy = cast(SrcField->getType())->getElementType(); uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; IntegerType *FieldIntTy = IntegerType::get(LI->getContext(), FieldSizeBits); if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() && !FieldTy->isVectorTy()) SrcField = new BitCastInst(SrcField, PointerType::getUnqual(FieldIntTy), "", LI); SrcField = new LoadInst(SrcField, "sroa.load.elt", LI); // If SrcField is a fp or vector of the right size but that isn't an // integer type, bitcast to an integer so we can shift it. if (SrcField->getType() != FieldIntTy) SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI); // Zero extend the field to be the same size as the final alloca so that // we can shift and insert it. if (SrcField->getType() != ResultVal->getType()) SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI); // Determine the number of bits to shift SrcField. 
uint64_t Shift; if (Layout) // Struct case. Shift = Layout->getElementOffsetInBits(i); else // Array case. Shift = i*ArrayEltBitOffset; if (DL.isBigEndian()) Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth(); if (Shift) { Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift); SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI); } // Don't create an 'or x, 0' on the first iteration. if (!isa(ResultVal) || !cast(ResultVal)->isNullValue()) ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI); else ResultVal = SrcField; } // Handle tail padding by truncating the result if (DL.getTypeSizeInBits(LI->getType()) != AllocaSizeBits) ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI); LI->replaceAllUsesWith(ResultVal); DeadInsts.push_back(LI); }

/// HasPadding - Return true if the specified type has any structure or
/// alignment padding in between the elements that would be split apart
/// by SROA; return false otherwise.
///
/// For an array, the element's size in bits is compared with its allocation
/// size in bits. For a struct, any gap between the end of one field and the
/// start of the next counts as padding, as does any gap between the end of
/// the last field and the end of the struct (tail padding). A struct with no
/// fields has no padding. Any other kind of type fails the cast<>.
static bool HasPadding(Type *Ty, const DataLayout &DL) {
  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    Type *EltTy = ATy->getElementType();
    return DL.getTypeSizeInBits(EltTy) != DL.getTypeAllocSizeInBits(EltTy);
  }

  // SROA currently handles only Arrays and Structs.
  StructType *STy = cast<StructType>(Ty);
  const StructLayout *SL = DL.getStructLayout(STy);
  const unsigned NumFields = STy->getNumElements();

  // Bit positions are kept in 64 bits. Holding them in 'unsigned' would
  // truncate the layout's offsets and sizes for very large structs and could
  // report (or miss) padding that is not (or is) there.
  uint64_t PrevFieldEnd = 0;
  for (unsigned i = 0; i != NumFields; ++i) {
    const uint64_t FieldStart = SL->getElementOffsetInBits(i);

    // Check to see if there is any padding between this element and the
    // previous one.
    if (i != 0 && PrevFieldEnd < FieldStart)
      return true;

    PrevFieldEnd = FieldStart + DL.getTypeSizeInBits(STy->getElementType(i));
  }

  // Check for tail padding.
  return NumFields != 0 && PrevFieldEnd < SL->getSizeInBits();
}

/// isSafeStructAllocaToScalarRepl - Check to see if the specified allocation of /// an aggregate can be broken down into elements. Return 0 if not, 3 if safe, /// or 1 if safe after canonicalization has been performed. bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { // Loop over the use list of the alloca. We can only transform it if all of // the users are safe to transform. AllocaInfo Info(AI); isSafeForScalarRepl(AI, 0, Info); if (Info.isUnsafe) { DEBUG(dbgs() << "Cannot transform: " << *AI << '\n'); return false; } const DataLayout &DL = AI->getModule()->getDataLayout(); // Okay, we know all the users are promotable. If the aggregate is a memcpy // source and destination, we have to be careful. In particular, the memcpy // could be moving around elements that live in structure padding of the LLVM // types, but may actually be used. In these cases, we refuse to promote the // struct. if (Info.isMemCpySrc && Info.isMemCpyDst && HasPadding(AI->getAllocatedType(), DL)) return false; // If the alloca never has an access to just *part* of it, but is accessed // via loads and stores, then we should use ConvertToScalarInfo to promote // the alloca instead of promoting each piece at a time and inserting fission // and fusion code. if (!Info.hasSubelementAccess && Info.hasALoadOrStore) { // If the struct/array just has one element, use basic SRoA.
if (StructType *ST = dyn_cast(AI->getAllocatedType())) { if (ST->getNumElements() > 1) return false; } else { if (cast(AI->getAllocatedType())->getNumElements() > 1) return false; } } return true; } diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index fdf1199a66fb..7df3ba01998b 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -1,1564 +1,1591 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s ; This test makes sure that these instructions are properly eliminated. ; PR1822 target datalayout = "e-p:64:64-p1:16:16-p2:32:32:32-p3:64:64:64" define i32 @test1(i32 %A, i32 %B) { %C = select i1 false, i32 %A, i32 %B ret i32 %C ; CHECK-LABEL: @test1( ; CHECK: ret i32 %B } define i32 @test2(i32 %A, i32 %B) { %C = select i1 true, i32 %A, i32 %B ret i32 %C ; CHECK-LABEL: @test2( ; CHECK: ret i32 %A } define i32 @test3(i1 %C, i32 %I) { ; V = I %V = select i1 %C, i32 %I, i32 %I ret i32 %V ; CHECK-LABEL: @test3( ; CHECK: ret i32 %I } define i1 @test4(i1 %C) { ; V = C %V = select i1 %C, i1 true, i1 false ret i1 %V ; CHECK-LABEL: @test4( ; CHECK: ret i1 %C } define i1 @test5(i1 %C) { ; V = !C %V = select i1 %C, i1 false, i1 true ret i1 %V ; CHECK-LABEL: @test5( ; CHECK: xor i1 %C, true ; CHECK: ret i1 } define i32 @test6(i1 %C) { ; V = cast C to int %V = select i1 %C, i32 1, i32 0 ret i32 %V ; CHECK-LABEL: @test6( ; CHECK: %V = zext i1 %C to i32 ; CHECK: ret i32 %V } define i1 @test7(i1 %C, i1 %X) { ; R = or C, X %R = select i1 %C, i1 true, i1 %X ret i1 %R ; CHECK-LABEL: @test7( ; CHECK: %R = or i1 %C, %X ; CHECK: ret i1 %R } define i1 @test8(i1 %C, i1 %X) { ; R = and C, X %R = select i1 %C, i1 %X, i1 false ret i1 %R ; CHECK-LABEL: @test8( ; CHECK: %R = and i1 %C, %X ; CHECK: ret i1 %R } define i1 @test9(i1 %C, i1 %X) { ; R = and !C, X %R = select i1 %C, i1 false, i1 %X ret i1 %R ; CHECK-LABEL: @test9( ; CHECK: xor i1 %C, true ; CHECK: %R = and i1 ; CHECK: ret i1 %R } 
define i1 @test10(i1 %C, i1 %X) { ; R = or !C, X %R = select i1 %C, i1 %X, i1 true ret i1 %R ; CHECK-LABEL: @test10( ; CHECK: xor i1 %C, true ; CHECK: %R = or i1 ; CHECK: ret i1 %R } define i32 @test11(i32 %a) { %C = icmp eq i32 %a, 0 %R = select i1 %C, i32 0, i32 1 ret i32 %R ; CHECK-LABEL: @test11( ; CHECK: icmp ne i32 %a, 0 ; CHECK: %R = zext i1 ; CHECK: ret i32 %R } define i32 @test12(i1 %cond, i32 %a) { %b = or i32 %a, 1 %c = select i1 %cond, i32 %b, i32 %a ret i32 %c ; CHECK-LABEL: @test12( ; CHECK: %b = zext i1 %cond to i32 ; CHECK: %c = or i32 %b, %a ; CHECK: ret i32 %c } define i32 @test12a(i1 %cond, i32 %a) { %b = ashr i32 %a, 1 %c = select i1 %cond, i32 %b, i32 %a ret i32 %c ; CHECK-LABEL: @test12a( ; CHECK: %b = zext i1 %cond to i32 ; CHECK: %c = ashr i32 %a, %b ; CHECK: ret i32 %c } define i32 @test12b(i1 %cond, i32 %a) { %b = ashr i32 %a, 1 %c = select i1 %cond, i32 %a, i32 %b ret i32 %c ; CHECK-LABEL: @test12b( ; CHECK: zext i1 %cond to i32 ; CHECK: %b = xor i32 ; CHECK: %c = ashr i32 %a, %b ; CHECK: ret i32 %c } define i32 @test13(i32 %a, i32 %b) { %C = icmp eq i32 %a, %b %V = select i1 %C, i32 %a, i32 %b ret i32 %V ; CHECK-LABEL: @test13( ; CHECK: ret i32 %b } define i32 @test13a(i32 %a, i32 %b) { %C = icmp ne i32 %a, %b %V = select i1 %C, i32 %a, i32 %b ret i32 %V ; CHECK-LABEL: @test13a( ; CHECK: ret i32 %a } define i32 @test13b(i32 %a, i32 %b) { %C = icmp eq i32 %a, %b %V = select i1 %C, i32 %b, i32 %a ret i32 %V ; CHECK-LABEL: @test13b( ; CHECK: ret i32 %a } define i1 @test14a(i1 %C, i32 %X) { %V = select i1 %C, i32 %X, i32 0 ; (X < 1) | !C %R = icmp slt i32 %V, 1 ret i1 %R ; CHECK-LABEL: @test14a( ; CHECK: icmp slt i32 %X, 1 ; CHECK: xor i1 %C, true ; CHECK: or i1 ; CHECK: ret i1 %R } define i1 @test14b(i1 %C, i32 %X) { %V = select i1 %C, i32 0, i32 %X ; (X < 1) | C %R = icmp slt i32 %V, 1 ret i1 %R ; CHECK-LABEL: @test14b( ; CHECK: icmp slt i32 %X, 1 ; CHECK: or i1 ; CHECK: ret i1 %R } ;; Code sequence for (X & 16) ? 
16 : 0 define i32 @test15a(i32 %X) { %t1 = and i32 %X, 16 %t2 = icmp eq i32 %t1, 0 %t3 = select i1 %t2, i32 0, i32 16 ret i32 %t3 ; CHECK-LABEL: @test15a( ; CHECK: %t1 = and i32 %X, 16 ; CHECK: ret i32 %t1 } ;; Code sequence for (X & 32) ? 0 : 32 define i32 @test15b(i32 %X) { %t1 = and i32 %X, 32 %t2 = icmp eq i32 %t1, 0 %t3 = select i1 %t2, i32 32, i32 0 ret i32 %t3 ; CHECK-LABEL: @test15b( ; CHECK: %t1 = and i32 %X, 32 ; CHECK: xor i32 %t1, 32 ; CHECK: ret i32 } ;; Alternate code sequence for (X & 16) ? 16 : 0 define i32 @test15c(i32 %X) { %t1 = and i32 %X, 16 %t2 = icmp eq i32 %t1, 16 %t3 = select i1 %t2, i32 16, i32 0 ret i32 %t3 ; CHECK-LABEL: @test15c( ; CHECK: %t1 = and i32 %X, 16 ; CHECK: ret i32 %t1 } ;; Alternate code sequence for (X & 16) ? 16 : 0 define i32 @test15d(i32 %X) { %t1 = and i32 %X, 16 %t2 = icmp ne i32 %t1, 0 %t3 = select i1 %t2, i32 16, i32 0 ret i32 %t3 ; CHECK-LABEL: @test15d( ; CHECK: %t1 = and i32 %X, 16 ; CHECK: ret i32 %t1 } ;; (a & 128) ? 256 : 0 define i32 @test15e(i32 %X) { %t1 = and i32 %X, 128 %t2 = icmp ne i32 %t1, 0 %t3 = select i1 %t2, i32 256, i32 0 ret i32 %t3 ; CHECK-LABEL: @test15e( ; CHECK: %t1 = shl i32 %X, 1 ; CHECK: and i32 %t1, 256 ; CHECK: ret i32 } ;; (a & 128) ? 0 : 256 define i32 @test15f(i32 %X) { %t1 = and i32 %X, 128 %t2 = icmp ne i32 %t1, 0 %t3 = select i1 %t2, i32 0, i32 256 ret i32 %t3 ; CHECK-LABEL: @test15f( ; CHECK: %t1 = shl i32 %X, 1 ; CHECK: and i32 %t1, 256 ; CHECK: xor i32 %{{.*}}, 256 ; CHECK: ret i32 } ;; (a & 8) ? -1 : -9 define i32 @test15g(i32 %X) { %t1 = and i32 %X, 8 %t2 = icmp ne i32 %t1, 0 %t3 = select i1 %t2, i32 -1, i32 -9 ret i32 %t3 ; CHECK-LABEL: @test15g( ; CHECK-NEXT: %1 = or i32 %X, -9 ; CHECK-NEXT: ret i32 %1 } ;; (a & 8) ? -9 : -1 define i32 @test15h(i32 %X) { %t1 = and i32 %X, 8 %t2 = icmp ne i32 %t1, 0 %t3 = select i1 %t2, i32 -9, i32 -1 ret i32 %t3 ; CHECK-LABEL: @test15h( ; CHECK-NEXT: %1 = or i32 %X, -9 ; CHECK-NEXT: %2 = xor i32 %1, 8 ; CHECK-NEXT: ret i32 %2 } ;; (a & 2) ? 
577 : 1089 define i32 @test15i(i32 %X) { %t1 = and i32 %X, 2 %t2 = icmp ne i32 %t1, 0 %t3 = select i1 %t2, i32 577, i32 1089 ret i32 %t3 ; CHECK-LABEL: @test15i( ; CHECK-NEXT: %t1 = shl i32 %X, 8 ; CHECK-NEXT: %1 = and i32 %t1, 512 ; CHECK-NEXT: %2 = xor i32 %1, 512 ; CHECK-NEXT: %3 = add nuw nsw i32 %2, 577 ; CHECK-NEXT: ret i32 %3 } ;; (a & 2) ? 1089 : 577 define i32 @test15j(i32 %X) { %t1 = and i32 %X, 2 %t2 = icmp ne i32 %t1, 0 %t3 = select i1 %t2, i32 1089, i32 577 ret i32 %t3 ; CHECK-LABEL: @test15j( ; CHECK-NEXT: %t1 = shl i32 %X, 8 ; CHECK-NEXT: %1 = and i32 %t1, 512 ; CHECK-NEXT: %2 = add nuw nsw i32 %1, 577 ; CHECK-NEXT: ret i32 %2 } define i32 @test16(i1 %C, i32* %P) { %P2 = select i1 %C, i32* %P, i32* null %V = load i32, i32* %P2 ret i32 %V ; CHECK-LABEL: @test16( ; CHECK-NEXT: %V = load i32, i32* %P ; CHECK: ret i32 %V } ;; It may be legal to load from a null address in a non-zero address space define i32 @test16_neg(i1 %C, i32 addrspace(1)* %P) { %P2 = select i1 %C, i32 addrspace(1)* %P, i32 addrspace(1)* null %V = load i32, i32 addrspace(1)* %P2 ret i32 %V ; CHECK-LABEL: @test16_neg ; CHECK-NEXT: %P2 = select i1 %C, i32 addrspace(1)* %P, i32 addrspace(1)* null ; CHECK-NEXT: %V = load i32, i32 addrspace(1)* %P2 ; CHECK: ret i32 %V } define i32 @test16_neg2(i1 %C, i32 addrspace(1)* %P) { %P2 = select i1 %C, i32 addrspace(1)* null, i32 addrspace(1)* %P %V = load i32, i32 addrspace(1)* %P2 ret i32 %V ; CHECK-LABEL: @test16_neg2 ; CHECK-NEXT: %P2 = select i1 %C, i32 addrspace(1)* null, i32 addrspace(1)* %P ; CHECK-NEXT: %V = load i32, i32 addrspace(1)* %P2 ; CHECK: ret i32 %V } define i1 @test17(i32* %X, i1 %C) { %R = select i1 %C, i32* %X, i32* null %RV = icmp eq i32* %R, null ret i1 %RV ; CHECK-LABEL: @test17( ; CHECK: icmp eq i32* %X, null ; CHECK: xor i1 %C, true ; CHECK: %RV = or i1 ; CHECK: ret i1 %RV } define i32 @test18(i32 %X, i32 %Y, i1 %C) { %R = select i1 %C, i32 %X, i32 0 %V = sdiv i32 %Y, %R ret i32 %V ; CHECK-LABEL: @test18( ; CHECK: %V = 
sdiv i32 %Y, %X ; CHECK: ret i32 %V } define i32 @test19(i32 %x) { %tmp = icmp ugt i32 %x, 2147483647 %retval = select i1 %tmp, i32 -1, i32 0 ret i32 %retval ; CHECK-LABEL: @test19( ; CHECK-NEXT: ashr i32 %x, 31 ; CHECK-NEXT: ret i32 } define i32 @test20(i32 %x) { %tmp = icmp slt i32 %x, 0 %retval = select i1 %tmp, i32 -1, i32 0 ret i32 %retval ; CHECK-LABEL: @test20( ; CHECK-NEXT: ashr i32 %x, 31 ; CHECK-NEXT: ret i32 } define i64 @test21(i32 %x) { %tmp = icmp slt i32 %x, 0 %retval = select i1 %tmp, i64 -1, i64 0 ret i64 %retval ; CHECK-LABEL: @test21( ; CHECK-NEXT: ashr i32 %x, 31 ; CHECK-NEXT: sext i32 ; CHECK-NEXT: ret i64 } define i16 @test22(i32 %x) { %tmp = icmp slt i32 %x, 0 %retval = select i1 %tmp, i16 -1, i16 0 ret i16 %retval ; CHECK-LABEL: @test22( ; CHECK-NEXT: ashr i32 %x, 31 ; CHECK-NEXT: trunc i32 ; CHECK-NEXT: ret i16 } define i1 @test23(i1 %a, i1 %b) { %c = select i1 %a, i1 %b, i1 %a ret i1 %c ; CHECK-LABEL: @test23( ; CHECK-NEXT: %c = and i1 %a, %b ; CHECK-NEXT: ret i1 %c } define i1 @test24(i1 %a, i1 %b) { %c = select i1 %a, i1 %a, i1 %b ret i1 %c ; CHECK-LABEL: @test24( ; CHECK-NEXT: %c = or i1 %a, %b ; CHECK-NEXT: ret i1 %c } define i32 @test25(i1 %c) { entry: br i1 %c, label %jump, label %ret jump: br label %ret ret: %a = phi i1 [true, %jump], [false, %entry] %b = select i1 %a, i32 10, i32 20 ret i32 %b ; CHECK-LABEL: @test25( ; CHECK: %a = phi i32 [ 10, %jump ], [ 20, %entry ] ; CHECK-NEXT: ret i32 %a } define i32 @test26(i1 %cond) { entry: br i1 %cond, label %jump, label %ret jump: %c = or i1 false, false br label %ret ret: %a = phi i1 [true, %entry], [%c, %jump] %b = select i1 %a, i32 20, i32 10 ret i32 %b ; CHECK-LABEL: @test26( ; CHECK: %a = phi i32 [ 20, %entry ], [ 10, %jump ] ; CHECK-NEXT: ret i32 %a } define i32 @test27(i1 %c, i32 %A, i32 %B) { entry: br i1 %c, label %jump, label %ret jump: br label %ret ret: %a = phi i1 [true, %jump], [false, %entry] %b = select i1 %a, i32 %A, i32 %B ret i32 %b ; CHECK-LABEL: @test27( ; CHECK: %a = 
phi i32 [ %A, %jump ], [ %B, %entry ] ; CHECK-NEXT: ret i32 %a } define i32 @test28(i1 %cond, i32 %A, i32 %B) { entry: br i1 %cond, label %jump, label %ret jump: br label %ret ret: %c = phi i32 [%A, %jump], [%B, %entry] %a = phi i1 [true, %jump], [false, %entry] %b = select i1 %a, i32 %A, i32 %c ret i32 %b ; CHECK-LABEL: @test28( ; CHECK: %a = phi i32 [ %A, %jump ], [ %B, %entry ] ; CHECK-NEXT: ret i32 %a } define i32 @test29(i1 %cond, i32 %A, i32 %B) { entry: br i1 %cond, label %jump, label %ret jump: br label %ret ret: %c = phi i32 [%A, %jump], [%B, %entry] %a = phi i1 [true, %jump], [false, %entry] br label %next next: %b = select i1 %a, i32 %A, i32 %c ret i32 %b ; CHECK-LABEL: @test29( ; CHECK: %a = phi i32 [ %A, %jump ], [ %B, %entry ] ; CHECK: ret i32 %a } ; SMAX(SMAX(x, y), x) -> SMAX(x, y) define i32 @test30(i32 %x, i32 %y) { %cmp = icmp sgt i32 %x, %y %cond = select i1 %cmp, i32 %x, i32 %y %cmp5 = icmp sgt i32 %cond, %x %retval = select i1 %cmp5, i32 %cond, i32 %x ret i32 %retval ; CHECK-LABEL: @test30( ; CHECK: ret i32 %cond } ; UMAX(UMAX(x, y), x) -> UMAX(x, y) define i32 @test31(i32 %x, i32 %y) { %cmp = icmp ugt i32 %x, %y %cond = select i1 %cmp, i32 %x, i32 %y %cmp5 = icmp ugt i32 %cond, %x %retval = select i1 %cmp5, i32 %cond, i32 %x ret i32 %retval ; CHECK-LABEL: @test31( ; CHECK: ret i32 %cond } ; SMIN(SMIN(x, y), x) -> SMIN(x, y) define i32 @test32(i32 %x, i32 %y) { %cmp = icmp sgt i32 %x, %y %cond = select i1 %cmp, i32 %y, i32 %x %cmp5 = icmp sgt i32 %cond, %x %retval = select i1 %cmp5, i32 %x, i32 %cond ret i32 %retval ; CHECK-LABEL: @test32( ; CHECK: ret i32 %cond } ; MAX(MIN(x, y), x) -> x define i32 @test33(i32 %x, i32 %y) { %cmp = icmp sgt i32 %x, %y %cond = select i1 %cmp, i32 %y, i32 %x %cmp5 = icmp sgt i32 %cond, %x %retval = select i1 %cmp5, i32 %cond, i32 %x ret i32 %retval ; CHECK-LABEL: @test33( ; CHECK: ret i32 %x } ; MIN(MAX(x, y), x) -> x define i32 @test34(i32 %x, i32 %y) { %cmp = icmp sgt i32 %x, %y %cond = select i1 %cmp, i32 %x, 
i32 %y %cmp5 = icmp sgt i32 %cond, %x %retval = select i1 %cmp5, i32 %x, i32 %cond ret i32 %retval ; CHECK-LABEL: @test34( ; CHECK: ret i32 %x } define i32 @test35(i32 %x) { %cmp = icmp sge i32 %x, 0 %cond = select i1 %cmp, i32 60, i32 100 ret i32 %cond ; CHECK-LABEL: @test35( ; CHECK: ashr i32 %x, 31 ; CHECK: and i32 {{.*}}, 40 ; CHECK: add nuw nsw i32 {{.*}}, 60 ; CHECK: ret } define i32 @test36(i32 %x) { %cmp = icmp slt i32 %x, 0 %cond = select i1 %cmp, i32 60, i32 100 ret i32 %cond ; CHECK-LABEL: @test36( ; CHECK: ashr i32 %x, 31 ; CHECK: and i32 {{.*}}, -40 ; CHECK: add nsw i32 {{.*}}, 100 ; CHECK: ret } define i32 @test37(i32 %x) { %cmp = icmp sgt i32 %x, -1 %cond = select i1 %cmp, i32 1, i32 -1 ret i32 %cond ; CHECK-LABEL: @test37( ; CHECK: ashr i32 %x, 31 ; CHECK: or i32 {{.*}}, 1 ; CHECK: ret } define i1 @test38(i1 %cond) { %zero = alloca i32 %one = alloca i32 %ptr = select i1 %cond, i32* %zero, i32* %one %isnull = icmp eq i32* %ptr, null ret i1 %isnull ; CHECK-LABEL: @test38( ; CHECK: ret i1 false } define i1 @test39(i1 %cond, double %x) { %s = select i1 %cond, double %x, double 0x7FF0000000000000 ; RHS = +infty %cmp = fcmp ule double %x, %s ret i1 %cmp ; CHECK-LABEL: @test39( ; CHECK: ret i1 true } define i1 @test40(i1 %cond) { %a = alloca i32 %b = alloca i32 %c = alloca i32 %s = select i1 %cond, i32* %a, i32* %b %r = icmp eq i32* %s, %c ret i1 %r ; CHECK-LABEL: @test40( ; CHECK: ret i1 false } define i32 @test41(i1 %cond, i32 %x, i32 %y) { %z = and i32 %x, %y %s = select i1 %cond, i32 %y, i32 %z %r = and i32 %x, %s ret i32 %r ; CHECK-LABEL: @test41( ; CHECK-NEXT: and i32 %x, %y ; CHECK-NEXT: ret i32 } define i32 @test42(i32 %x, i32 %y) { %b = add i32 %y, -1 %cond = icmp eq i32 %x, 0 %c = select i1 %cond, i32 %b, i32 %y ret i32 %c ; CHECK-LABEL: @test42( ; CHECK-NEXT: %cond = icmp eq i32 %x, 0 ; CHECK-NEXT: %b = sext i1 %cond to i32 ; CHECK-NEXT: %c = add i32 %b, %y ; CHECK-NEXT: ret i32 %c } define i64 @test43(i32 %a) nounwind { %a_ext = sext i32 %a to 
i64 %is_a_nonnegative = icmp sgt i32 %a, -1 %max = select i1 %is_a_nonnegative, i64 %a_ext, i64 0 ret i64 %max ; CHECK-LABEL: @test43( ; CHECK-NEXT: %a_ext = sext i32 %a to i64 ; CHECK-NEXT: %is_a_nonnegative = icmp slt i64 %a_ext, 0 ; CHECK-NEXT: %max = select i1 %is_a_nonnegative, i64 0, i64 %a_ext ; CHECK-NEXT: ret i64 %max } define i64 @test44(i32 %a) nounwind { %a_ext = sext i32 %a to i64 %is_a_nonpositive = icmp slt i32 %a, 1 %min = select i1 %is_a_nonpositive, i64 %a_ext, i64 0 ret i64 %min ; CHECK-LABEL: @test44( ; CHECK-NEXT: %a_ext = sext i32 %a to i64 ; CHECK-NEXT: %is_a_nonpositive = icmp sgt i64 %a_ext, 0 ; CHECK-NEXT: %min = select i1 %is_a_nonpositive, i64 0, i64 %a_ext ; CHECK-NEXT: ret i64 %min } define i64 @test45(i32 %a) nounwind { %a_ext = zext i32 %a to i64 %is_a_nonnegative = icmp ugt i32 %a, 2 %max = select i1 %is_a_nonnegative, i64 %a_ext, i64 3 ret i64 %max ; CHECK-LABEL: @test45( ; CHECK-NEXT: %a_ext = zext i32 %a to i64 ; CHECK-NEXT: %is_a_nonnegative = icmp ult i64 %a_ext, 3 ; CHECK-NEXT: %max = select i1 %is_a_nonnegative, i64 3, i64 %a_ext ; CHECK-NEXT: ret i64 %max } define i64 @test46(i32 %a) nounwind { %a_ext = zext i32 %a to i64 %is_a_nonpositive = icmp ult i32 %a, 3 %min = select i1 %is_a_nonpositive, i64 %a_ext, i64 2 ret i64 %min ; CHECK-LABEL: @test46( ; CHECK-NEXT: %a_ext = zext i32 %a to i64 ; CHECK-NEXT: %is_a_nonpositive = icmp ugt i64 %a_ext, 2 ; CHECK-NEXT: %min = select i1 %is_a_nonpositive, i64 2, i64 %a_ext ; CHECK-NEXT: ret i64 %min } define i64 @test47(i32 %a) nounwind { %a_ext = sext i32 %a to i64 %is_a_nonnegative = icmp ugt i32 %a, 2 %max = select i1 %is_a_nonnegative, i64 %a_ext, i64 3 ret i64 %max ; CHECK-LABEL: @test47( ; CHECK-NEXT: %a_ext = sext i32 %a to i64 ; CHECK-NEXT: %is_a_nonnegative = icmp ult i64 %a_ext, 3 ; CHECK-NEXT: %max = select i1 %is_a_nonnegative, i64 3, i64 %a_ext ; CHECK-NEXT: ret i64 %max } define i64 @test48(i32 %a) nounwind { %a_ext = sext i32 %a to i64 %is_a_nonpositive = icmp ult i32 
%a, 3 %min = select i1 %is_a_nonpositive, i64 %a_ext, i64 2 ret i64 %min ; CHECK-LABEL: @test48( ; CHECK-NEXT: %a_ext = sext i32 %a to i64 ; CHECK-NEXT: %is_a_nonpositive = icmp ugt i64 %a_ext, 2 ; CHECK-NEXT: %min = select i1 %is_a_nonpositive, i64 2, i64 %a_ext ; CHECK-NEXT: ret i64 %min } define i64 @test49(i32 %a) nounwind { %a_ext = sext i32 %a to i64 %is_a_nonpositive = icmp ult i32 %a, 3 %min = select i1 %is_a_nonpositive, i64 2, i64 %a_ext ret i64 %min ; CHECK-LABEL: @test49( ; CHECK-NEXT: %a_ext = sext i32 %a to i64 ; CHECK-NEXT: %is_a_nonpositive = icmp ugt i64 %a_ext, 2 ; CHECK-NEXT: %min = select i1 %is_a_nonpositive, i64 %a_ext, i64 2 ; CHECK-NEXT: ret i64 %min } define i64 @test50(i32 %a) nounwind { %is_a_nonpositive = icmp ult i32 %a, 3 %a_ext = sext i32 %a to i64 %min = select i1 %is_a_nonpositive, i64 2, i64 %a_ext ret i64 %min ; CHECK-LABEL: @test50( ; CHECK-NEXT: %a_ext = sext i32 %a to i64 ; CHECK-NEXT: %is_a_nonpositive = icmp ugt i64 %a_ext, 2 ; CHECK-NEXT: %min = select i1 %is_a_nonpositive, i64 %a_ext, i64 2 ; CHECK-NEXT: ret i64 %min } ; PR8994 ; This select instruction can't be eliminated because trying to do so would ; change the number of vector elements. This used to assert. define i48 @test51(<3 x i1> %icmp, <3 x i16> %tmp) { ; CHECK-LABEL: @test51( %select = select <3 x i1> %icmp, <3 x i16> zeroinitializer, <3 x i16> %tmp ; CHECK: select <3 x i1> %tmp2 = bitcast <3 x i16> %select to i48 ret i48 %tmp2 } ; PR8575 define i32 @test52(i32 %n, i32 %m) nounwind { ; CHECK-LABEL: @test52( %cmp = icmp sgt i32 %n, %m %. 
= select i1 %cmp, i32 1, i32 3 %add = add nsw i32 %., 3 %storemerge = select i1 %cmp, i32 %., i32 %add ; CHECK: select i1 %cmp, i32 1, i32 6 ret i32 %storemerge } ; PR9454 define i32 @test53(i32 %x) nounwind { %and = and i32 %x, 2 %cmp = icmp eq i32 %and, %x %sel = select i1 %cmp, i32 2, i32 1 ret i32 %sel ; CHECK-LABEL: @test53( ; CHECK: select i1 %cmp ; CHECK: ret } define i32 @test54(i32 %X, i32 %Y) { %A = ashr exact i32 %X, %Y %B = icmp eq i32 %A, 0 %C = select i1 %B, i32 %A, i32 1 ret i32 %C ; CHECK-LABEL: @test54( ; CHECK-NOT: ashr ; CHECK-NOT: select ; CHECK: icmp ne i32 %X, 0 ; CHECK: zext ; CHECK: ret } define i1 @test55(i1 %X, i32 %Y, i32 %Z) { %A = ashr exact i32 %Y, %Z %B = select i1 %X, i32 %Y, i32 %A %C = icmp eq i32 %B, 0 ret i1 %C ; CHECK-LABEL: @test55( ; CHECK-NOT: ashr ; CHECK-NOT: select ; CHECK: icmp eq ; CHECK: ret i1 } define i32 @test56(i16 %x) nounwind { %tobool = icmp eq i16 %x, 0 %conv = zext i16 %x to i32 %cond = select i1 %tobool, i32 0, i32 %conv ret i32 %cond ; CHECK-LABEL: @test56( ; CHECK-NEXT: zext ; CHECK-NEXT: ret } define i32 @test57(i32 %x, i32 %y) nounwind { %and = and i32 %x, %y %tobool = icmp eq i32 %x, 0 %.and = select i1 %tobool, i32 0, i32 %and ret i32 %.and ; CHECK-LABEL: @test57( ; CHECK-NEXT: and i32 %x, %y ; CHECK-NEXT: ret } define i32 @test58(i16 %x) nounwind { %tobool = icmp ne i16 %x, 1 %conv = zext i16 %x to i32 %cond = select i1 %tobool, i32 %conv, i32 1 ret i32 %cond ; CHECK-LABEL: @test58( ; CHECK-NEXT: zext ; CHECK-NEXT: ret } define i32 @test59(i32 %x, i32 %y) nounwind { %and = and i32 %x, %y %tobool = icmp ne i32 %x, %y %.and = select i1 %tobool, i32 %and, i32 %y ret i32 %.and ; CHECK-LABEL: @test59( ; CHECK-NEXT: and i32 %x, %y ; CHECK-NEXT: ret } define i1 @test60(i32 %x, i1* %y) nounwind { %cmp = icmp eq i32 %x, 0 %load = load i1, i1* %y, align 1 %cmp1 = icmp slt i32 %x, 1 %sel = select i1 %cmp, i1 %load, i1 %cmp1 ret i1 %sel ; CHECK-LABEL: @test60( ; CHECK: select } @glbl = constant i32 10 define i32 
@test61(i32* %ptr) { %A = load i32, i32* %ptr %B = icmp eq i32* %ptr, @glbl %C = select i1 %B, i32 %A, i32 10 ret i32 %C ; CHECK-LABEL: @test61( ; CHECK: ret i32 10 } define i1 @test62(i1 %A, i1 %B) { %not = xor i1 %A, true %C = select i1 %A, i1 %not, i1 %B ret i1 %C ; CHECK-LABEL: @test62( ; CHECK: %not = xor i1 %A, true ; CHECK: %C = and i1 %not, %B ; CHECK: ret i1 %C } define i1 @test63(i1 %A, i1 %B) { %not = xor i1 %A, true %C = select i1 %A, i1 %B, i1 %not ret i1 %C ; CHECK-LABEL: @test63( ; CHECK: %not = xor i1 %A, true ; CHECK: %C = or i1 %B, %not ; CHECK: ret i1 %C } ; PR14131 define void @test64(i32 %p, i16 %b) noreturn nounwind { entry: %p.addr.0.insert.mask = and i32 %p, -65536 %conv2 = and i32 %p, 65535 br i1 undef, label %lor.rhs, label %lor.end lor.rhs: %p.addr.0.extract.trunc = trunc i32 %p.addr.0.insert.mask to i16 %phitmp = zext i16 %p.addr.0.extract.trunc to i32 br label %lor.end lor.end: %t.1 = phi i32 [ 0, %entry ], [ %phitmp, %lor.rhs ] %conv6 = zext i16 %b to i32 %div = udiv i32 %conv6, %t.1 %tobool8 = icmp eq i32 %div, 0 %cmp = icmp eq i32 %t.1, 0 %cmp12 = icmp ult i32 %conv2, 2 %cmp.sink = select i1 %tobool8, i1 %cmp12, i1 %cmp br i1 %cmp.sink, label %cond.end17, label %cond.false16 cond.false16: br label %cond.end17 cond.end17: br label %while.body while.body: br label %while.body ; CHECK-LABEL: @test64( ; CHECK-NOT: select } ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2( ; CHECK-NEXT: [[SHL:%[a-z0-9]+]] = shl i32 %x, 1 ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 [[SHL]], 2 ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 [[AND]], %y ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_eq_and_1_0_or_2(i32 %x, i32 %y) { %and = and i32 %x, 1 %cmp = icmp eq i32 %and, 0 %or = or i32 %y, 2 %select = select i1 %cmp, i32 %y, i32 %or ret i32 %select } ; CHECK-LABEL: @select_icmp_eq_and_32_0_or_8( ; CHECK-NEXT: [[LSHR:%[a-z0-9]+]] = lshr i32 %x, 2 ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 [[LSHR]], 8 ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 [[AND]], %y ; 
CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_eq_and_32_0_or_8(i32 %x, i32 %y) { %and = and i32 %x, 32 %cmp = icmp eq i32 %and, 0 %or = or i32 %y, 8 %select = select i1 %cmp, i32 %y, i32 %or ret i32 %select } ; CHECK-LABEL: @select_icmp_ne_0_and_4096_or_4096( ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, 4096 ; CHECK-NEXT: [[XOR:%[a-z0-9]+]] = xor i32 [[AND]], 4096 ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 [[XOR]], %y ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_ne_0_and_4096_or_4096(i32 %x, i32 %y) { %and = and i32 %x, 4096 %cmp = icmp ne i32 0, %and %or = or i32 %y, 4096 %select = select i1 %cmp, i32 %y, i32 %or ret i32 %select } ; CHECK-LABEL: @select_icmp_eq_and_4096_0_or_4096( ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, 4096 ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 [[AND]], %y ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_eq_and_4096_0_or_4096(i32 %x, i32 %y) { %and = and i32 %x, 4096 %cmp = icmp eq i32 %and, 0 %or = or i32 %y, 4096 %select = select i1 %cmp, i32 %y, i32 %or ret i32 %select } ; CHECK-LABEL: @select_icmp_eq_0_and_1_or_1( ; CHECK-NEXT: [[TRUNC:%.+]] = trunc i64 %x to i32 ; CHECK-NEXT: [[AND:%.+]] = and i32 [[TRUNC]], 1 ; CHECK-NEXT: [[OR:%.+]] = or i32 [[AND]], %y ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_eq_0_and_1_or_1(i64 %x, i32 %y) { %and = and i64 %x, 1 %cmp = icmp eq i64 %and, 0 %or = or i32 %y, 1 %select = select i1 %cmp, i32 %y, i32 %or ret i32 %select } ; CHECK-LABEL: @select_icmp_ne_0_and_4096_or_32( ; CHECK-NEXT: [[LSHR:%[a-z0-9]+]] = lshr i32 %x, 7 ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 [[LSHR]], 32 ; CHECK-NEXT: [[XOR:%[a-z0-9]+]] = xor i32 [[AND]], 32 ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 [[XOR]], %y ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_ne_0_and_4096_or_32(i32 %x, i32 %y) { %and = and i32 %x, 4096 %cmp = icmp ne i32 0, %and %or = or i32 %y, 32 %select = select i1 %cmp, i32 %y, i32 %or ret i32 %select } ; CHECK-LABEL: @select_icmp_ne_0_and_32_or_4096( ; CHECK-NEXT: 
[[SHL:%[a-z0-9]+]] = shl i32 %x, 7 ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 [[SHL]], 4096 ; CHECK-NEXT: [[XOR:%[a-z0-9]+]] = xor i32 [[AND]], 4096 ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 [[XOR]], %y ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_ne_0_and_32_or_4096(i32 %x, i32 %y) { %and = and i32 %x, 32 %cmp = icmp ne i32 0, %and %or = or i32 %y, 4096 %select = select i1 %cmp, i32 %y, i32 %or ret i32 %select } ; CHECK-LABEL: @select_icmp_ne_0_and_1073741824_or_8( ; CHECK-NEXT: [[LSHR:%.+]] = lshr i32 %x, 27 ; CHECK-NEXT: [[TRUNC:%.+]] = trunc i32 [[LSHR]] to i8 ; CHECK-NEXT: [[AND:%.+]] = and i8 [[TRUNC]], 8 ; CHECK-NEXT: [[XOR:%.+]] = xor i8 [[AND]], 8 ; CHECK-NEXT: [[OR:%.+]] = or i8 [[XOR]], %y ; CHECK-NEXT: ret i8 [[OR]] define i8 @select_icmp_ne_0_and_1073741824_or_8(i32 %x, i8 %y) { %and = and i32 %x, 1073741824 %cmp = icmp ne i32 0, %and %or = or i8 %y, 8 %select = select i1 %cmp, i8 %y, i8 %or ret i8 %select } ; CHECK-LABEL: @select_icmp_ne_0_and_8_or_1073741824( ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i8 %x, 8 ; CHECK-NEXT: [[ZEXT:%[a-z0-9]+]] = zext i8 [[AND]] to i32 ; CHECK-NEXT: [[SHL:%[a-z0-9]+]] = shl nuw nsw i32 [[ZEXT]], 27 ; CHECK-NEXT: [[XOR:%[a-z0-9]+]] = xor i32 [[SHL]], 1073741824 ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 [[XOR]], %y ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_ne_0_and_8_or_1073741824(i8 %x, i32 %y) { %and = and i8 %x, 8 %cmp = icmp ne i8 0, %and %or = or i32 %y, 1073741824 %select = select i1 %cmp, i32 %y, i32 %or ret i32 %select } ; We can't combine here, because the cmp is scalar and the or vector. ; Just make sure we don't assert. 
define <2 x i32> @select_icmp_eq_and_1_0_or_vector_of_2s(i32 %x, <2 x i32> %y) { %and = and i32 %x, 1 %cmp = icmp eq i32 %and, 0 %or = or <2 x i32> %y, <i32 2, i32 2> %select = select i1 %cmp, <2 x i32> %y, <2 x i32> %or ret <2 x i32> %select } ; CHECK-LABEL: @select_icmp_and_8_ne_0_xor_8( ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, -9 ; CHECK-NEXT: ret i32 [[AND]] define i32 @select_icmp_and_8_ne_0_xor_8(i32 %x) { %and = and i32 %x, 8 %cmp = icmp eq i32 %and, 0 %xor = xor i32 %x, 8 %x.xor = select i1 %cmp, i32 %x, i32 %xor ret i32 %x.xor } ; CHECK-LABEL: @select_icmp_and_8_eq_0_xor_8( ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 %x, 8 ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_and_8_eq_0_xor_8(i32 %x) { %and = and i32 %x, 8 %cmp = icmp eq i32 %and, 0 %xor = xor i32 %x, 8 %xor.x = select i1 %cmp, i32 %xor, i32 %x ret i32 %xor.x } ; CHECK-LABEL: @select_icmp_x_and_8_eq_0_y_xor_8( ; CHECK: select i1 %cmp, i64 %y, i64 %xor define i64 @select_icmp_x_and_8_eq_0_y_xor_8(i32 %x, i64 %y) { %and = and i32 %x, 8 %cmp = icmp eq i32 %and, 0 %xor = xor i64 %y, 8 %y.xor = select i1 %cmp, i64 %y, i64 %xor ret i64 %y.xor } ; CHECK-LABEL: @select_icmp_x_and_8_ne_0_y_xor_8( ; CHECK: select i1 %cmp, i64 %xor, i64 %y define i64 @select_icmp_x_and_8_ne_0_y_xor_8(i32 %x, i64 %y) { %and = and i32 %x, 8 %cmp = icmp eq i32 %and, 0 %xor = xor i64 %y, 8 %xor.y = select i1 %cmp, i64 %xor, i64 %y ret i64 %xor.y } ; CHECK-LABEL: @select_icmp_x_and_8_ne_0_y_or_8( ; CHECK: xor i64 %1, 8 ; CHECK: or i64 %2, %y define i64 @select_icmp_x_and_8_ne_0_y_or_8(i32 %x, i64 %y) { %and = and i32 %x, 8 %cmp = icmp eq i32 %and, 0 %or = or i64 %y, 8 %or.y = select i1 %cmp, i64 %or, i64 %y ret i64 %or.y } ; CHECK-LABEL: @select_icmp_and_2147483648_ne_0_xor_2147483648( ; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, 2147483647 ; CHECK-NEXT: ret i32 [[AND]] define i32 @select_icmp_and_2147483648_ne_0_xor_2147483648(i32 %x) { %and = and i32 %x, 2147483648 %cmp = icmp eq i32 %and, 0 %xor = xor i32 %x, 2147483648 %x.xor = 
select i1 %cmp, i32 %x, i32 %xor ret i32 %x.xor } ; CHECK-LABEL: @select_icmp_and_2147483648_eq_0_xor_2147483648( ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 %x, -2147483648 ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_and_2147483648_eq_0_xor_2147483648(i32 %x) { %and = and i32 %x, 2147483648 %cmp = icmp eq i32 %and, 0 %xor = xor i32 %x, 2147483648 %xor.x = select i1 %cmp, i32 %xor, i32 %x ret i32 %xor.x } ; CHECK-LABEL: @select_icmp_x_and_2147483648_ne_0_or_2147483648( ; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 %x, -2147483648 ; CHECK-NEXT: ret i32 [[OR]] define i32 @select_icmp_x_and_2147483648_ne_0_or_2147483648(i32 %x) { %and = and i32 %x, 2147483648 %cmp = icmp eq i32 %and, 0 %or = or i32 %x, 2147483648 %or.x = select i1 %cmp, i32 %or, i32 %x ret i32 %or.x } define i32 @test65(i64 %x) { %1 = and i64 %x, 16 %2 = icmp ne i64 %1, 0 %3 = select i1 %2, i32 40, i32 42 ret i32 %3 ; CHECK-LABEL: @test65( ; CHECK: %[[TRUNC:.*]] = trunc i64 %x to i32 ; CHECK: %[[LSHR:.*]] = lshr i32 %[[TRUNC]], 3 ; CHECK: %[[AND:.*]] = and i32 %[[LSHR]], 2 ; CHECK: %[[XOR:.*]] = xor i32 %[[AND]], 42 ; CHECK: ret i32 %[[XOR]] } define i32 @test66(i64 %x) { %1 = and i64 %x, 4294967296 %2 = icmp ne i64 %1, 0 %3 = select i1 %2, i32 40, i32 42 ret i32 %3 ; CHECK-LABEL: @test66( ; CHECK: select } define i32 @test67(i16 %x) { %1 = and i16 %x, 4 %2 = icmp ne i16 %1, 0 %3 = select i1 %2, i32 40, i32 42 ret i32 %3 ; CHECK-LABEL: @test67( ; CHECK: and i16 %x, 4 ; CHECK: zext i16 %1 to i32 ; CHECK: lshr exact i32 %2, 1 ; CHECK: xor i32 %3, 42 } ; SMIN(SMIN(X, 11), 92) -> SMIN(X, 11) define i32 @test68(i32 %x) { entry: %cmp = icmp slt i32 11, %x %cond = select i1 %cmp, i32 11, i32 %x %cmp3 = icmp slt i32 92, %cond %retval = select i1 %cmp3, i32 92, i32 %cond ret i32 %retval ; CHECK-LABEL: @test68( ; CHECK: ret i32 %cond } ; MIN(MIN(X, 24), 83) -> MIN(X, 24) define i32 @test69(i32 %x) { entry: %cmp = icmp ult i32 24, %x %cond = select i1 %cmp, i32 24, i32 %x %cmp3 = icmp ult i32 83, %cond %retval 
= select i1 %cmp3, i32 83, i32 %cond ret i32 %retval ; CHECK-LABEL: @test69( ; CHECK: ret i32 %cond } ; SMAX(SMAX(X, 75), 36) -> SMAX(X, 75) define i32 @test70(i32 %x) { entry: %cmp = icmp slt i32 %x, 75 %cond = select i1 %cmp, i32 75, i32 %x %cmp3 = icmp slt i32 %cond, 36 %retval = select i1 %cmp3, i32 36, i32 %cond ret i32 %retval ; CHECK-LABEL: @test70( ; CHECK: ret i32 %cond } ; MAX(MAX(X, 68), 47) -> MAX(X, 68) define i32 @test71(i32 %x) { entry: %cmp = icmp ult i32 %x, 68 %cond = select i1 %cmp, i32 68, i32 %x %cmp3 = icmp ult i32 %cond, 47 %retval = select i1 %cmp3, i32 47, i32 %cond ret i32 %retval ; CHECK-LABEL: @test71( ; CHECK: ret i32 %cond } ; SMIN(SMIN(X, 92), 11) -> SMIN(X, 11) define i32 @test72(i32 %x) { %cmp = icmp sgt i32 %x, 92 %cond = select i1 %cmp, i32 92, i32 %x %cmp3 = icmp sgt i32 %cond, 11 %retval = select i1 %cmp3, i32 11, i32 %cond ret i32 %retval ; CHECK-LABEL: @test72( ; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 11 ; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 11, i32 %x ; CHECK-NEXT: ret i32 [[SEL]] } ; MIN(MIN(X, 83), 24) -> MIN(X, 24) define i32 @test73(i32 %x) { %cmp = icmp ugt i32 %x, 83 %cond = select i1 %cmp, i32 83, i32 %x %cmp3 = icmp ugt i32 %cond, 24 %retval = select i1 %cmp3, i32 24, i32 %cond ret i32 %retval ; CHECK-LABEL: @test73( ; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ugt i32 %x, 24 ; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 24, i32 %x ; CHECK-NEXT: ret i32 [[SEL]] } ; SMAX(SMAX(X, 36), 75) -> SMAX(X, 75) define i32 @test74(i32 %x) { %cmp = icmp slt i32 %x, 36 %cond = select i1 %cmp, i32 36, i32 %x %cmp3 = icmp slt i32 %cond, 75 %retval = select i1 %cmp3, i32 75, i32 %cond ret i32 %retval ; CHECK-LABEL: @test74( ; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 75 ; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 75, i32 %x ; CHECK-NEXT: ret i32 [[SEL]] } ; MAX(MAX(X, 47), 68) -> MAX(X, 68) define i32 @test75(i32 %x) { %cmp = icmp ult i32 %x, 47 %cond = select i1 %cmp, i32 
47, i32 %x %cmp3 = icmp ult i32 %cond, 68 %retval = select i1 %cmp3, i32 68, i32 %cond ret i32 %retval ; CHECK-LABEL: @test75( ; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ult i32 %x, 68 ; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 68, i32 %x ; CHECK-NEXT: ret i32 [[SEL]] } @under_aligned = external global i32, align 1 define i32 @test76(i1 %flag, i32* %x) { ; The load here must not be speculated around the select. One side of the ; select is trivially dereferenceable but may have a lower alignment than the ; load does. ; CHECK-LABEL: @test76( ; CHECK: store i32 0, i32* %x ; CHECK: %[[P:.*]] = select i1 %flag, i32* @under_aligned, i32* %x ; CHECK: load i32, i32* %[[P]] store i32 0, i32* %x %p = select i1 %flag, i32* @under_aligned, i32* %x %v = load i32, i32* %p ret i32 %v } declare void @scribble_on_i32(i32*) define i32 @test77(i1 %flag, i32* %x) { ; The load here must not be speculated around the select. One side of the ; select is trivially dereferenceable but may have a lower alignment than the ; load does. ; CHECK-LABEL: @test77( ; CHECK: %[[A:.*]] = alloca i32, align 1 ; CHECK: call void @scribble_on_i32(i32* nonnull %[[A]]) ; CHECK: store i32 0, i32* %x ; CHECK: %[[P:.*]] = select i1 %flag, i32* %[[A]], i32* %x ; CHECK: load i32, i32* %[[P]] %under_aligned = alloca i32, align 1 call void @scribble_on_i32(i32* %under_aligned) store i32 0, i32* %x %p = select i1 %flag, i32* %under_aligned, i32* %x %v = load i32, i32* %p ret i32 %v } define i32 @test78(i1 %flag, i32* %x, i32* %y, i32* %z) { ; Test that we can speculate the loads around the select even when we can't ; fold the load completely away. ; CHECK-LABEL: @test78( ; CHECK: %[[V1:.*]] = load i32, i32* %x ; CHECK-NEXT: %[[V2:.*]] = load i32, i32* %y ; CHECK-NEXT: %[[S:.*]] = select i1 %flag, i32 %[[V1]], i32 %[[V2]] ; CHECK-NEXT: ret i32 %[[S]] entry: store i32 0, i32* %x store i32 0, i32* %y ; Block forwarding by storing to %z which could alias either %x or %y. 
store i32 42, i32* %z %p = select i1 %flag, i32* %x, i32* %y %v = load i32, i32* %p ret i32 %v } +define i32 @test78_deref(i1 %flag, i32* dereferenceable(4) %x, i32* dereferenceable(4) %y, i32* %z) { +; Test that we can speculate the loads around the select even when we can't +; fold the load completely away. +; CHECK-LABEL: @test78_deref( +; CHECK: %[[V1:.*]] = load i32, i32* %x +; CHECK-NEXT: %[[V2:.*]] = load i32, i32* %y +; CHECK-NEXT: %[[S:.*]] = select i1 %flag, i32 %[[V1]], i32 %[[V2]] +; CHECK-NEXT: ret i32 %[[S]] +entry: + %p = select i1 %flag, i32* %x, i32* %y + %v = load i32, i32* %p + ret i32 %v +} + define i32 @test78_neg(i1 %flag, i32* %x, i32* %y, i32* %z) { ; The same as @test78 but we can't speculate the load because it can trap ; if under-aligned. ; CHECK-LABEL: @test78_neg( ; CHECK: %p = select i1 %flag, i32* %x, i32* %y ; CHECK-NEXT: %v = load i32, i32* %p, align 16 ; CHECK-NEXT: ret i32 %v entry: store i32 0, i32* %x store i32 0, i32* %y ; Block forwarding by storing to %z which could alias either %x or %y. store i32 42, i32* %z %p = select i1 %flag, i32* %x, i32* %y %v = load i32, i32* %p, align 16 ret i32 %v } +define i32 @test78_deref_neg(i1 %flag, i32* dereferenceable(2) %x, i32* dereferenceable(4) %y, i32* %z) { +; The same as @test78_deref but we can't speculate the load because +; one of the arguments is not sufficiently dereferenceable. +; CHECK-LABEL: @test78_deref_neg( +; CHECK: %p = select i1 %flag, i32* %x, i32* %y +; CHECK-NEXT: %v = load i32, i32* %p +; CHECK-NEXT: ret i32 %v +entry: + %p = select i1 %flag, i32* %x, i32* %y + %v = load i32, i32* %p + ret i32 %v +} + define float @test79(i1 %flag, float* %x, i32* %y, i32* %z) { ; Test that we can speculate the loads around the select even when we can't ; fold the load completely away. 
; CHECK-LABEL: @test79( ; CHECK: %[[V1:.*]] = load float, float* %x ; CHECK-NEXT: %[[V2:.*]] = load float, float* %y ; CHECK-NEXT: %[[S:.*]] = select i1 %flag, float %[[V1]], float %[[V2]] ; CHECK-NEXT: ret float %[[S]] entry: %x1 = bitcast float* %x to i32* %y1 = bitcast i32* %y to float* store i32 0, i32* %x1 store i32 0, i32* %y ; Block forwarding by storing to %z which could alias either %x or %y. store i32 42, i32* %z %p = select i1 %flag, float* %x, float* %y1 %v = load float, float* %p ret float %v } define i32 @test80(i1 %flag) { ; Test that when we speculate the loads around the select they fold through ; load->load folding and load->store folding. ; CHECK-LABEL: @test80( ; CHECK: %[[X:.*]] = alloca i32 ; CHECK-NEXT: %[[Y:.*]] = alloca i32 ; CHECK: %[[V:.*]] = load i32, i32* %[[X]] ; CHECK-NEXT: store i32 %[[V]], i32* %[[Y]] ; CHECK-NEXT: ret i32 %[[V]] entry: %x = alloca i32 %y = alloca i32 call void @scribble_on_i32(i32* %x) call void @scribble_on_i32(i32* %y) %tmp = load i32, i32* %x store i32 %tmp, i32* %y %p = select i1 %flag, i32* %x, i32* %y %v = load i32, i32* %p ret i32 %v } define float @test81(i1 %flag) { ; Test that we can speculate the load around the select even though they use ; differently typed pointers. ; CHECK-LABEL: @test81( ; CHECK: %[[X:.*]] = alloca i32 ; CHECK-NEXT: %[[Y:.*]] = alloca i32 ; CHECK: %[[V:.*]] = load i32, i32* %[[X]] ; CHECK-NEXT: store i32 %[[V]], i32* %[[Y]] ; CHECK-NEXT: %[[C:.*]] = bitcast i32 %[[V]] to float ; CHECK-NEXT: ret float %[[C]] entry: %x = alloca float %y = alloca i32 %x1 = bitcast float* %x to i32* %y1 = bitcast i32* %y to float* call void @scribble_on_i32(i32* %x1) call void @scribble_on_i32(i32* %y) %tmp = load i32, i32* %x1 store i32 %tmp, i32* %y %p = select i1 %flag, float* %x, float* %y1 %v = load float, float* %p ret float %v } define i32 @test82(i1 %flag) { ; Test that we can speculate the load around the select even though they use ; differently typed pointers. 
; CHECK-LABEL: @test82( ; CHECK: %[[X:.*]] = alloca float ; CHECK-NEXT: %[[Y:.*]] = alloca i32 ; CHECK-NEXT: %[[X1:.*]] = bitcast float* %[[X]] to i32* ; CHECK-NEXT: %[[Y1:.*]] = bitcast i32* %[[Y]] to float* ; CHECK: %[[V:.*]] = load float, float* %[[X]] ; CHECK-NEXT: store float %[[V]], float* %[[Y1]] ; CHECK-NEXT: %[[C:.*]] = bitcast float %[[V]] to i32 ; CHECK-NEXT: ret i32 %[[C]] entry: %x = alloca float %y = alloca i32 %x1 = bitcast float* %x to i32* %y1 = bitcast i32* %y to float* call void @scribble_on_i32(i32* %x1) call void @scribble_on_i32(i32* %y) %tmp = load float, float* %x store float %tmp, float* %y1 %p = select i1 %flag, i32* %x1, i32* %y %v = load i32, i32* %p ret i32 %v } declare void @scribble_on_i64(i64*) declare void @scribble_on_i128(i128*) define i8* @test83(i1 %flag) { ; Test that we can speculate the load around the select even though they use ; differently typed pointers and requires inttoptr casts. ; CHECK-LABEL: @test83( ; CHECK: %[[X:.*]] = alloca i8* ; CHECK-NEXT: %[[Y:.*]] = alloca i8* ; CHECK-DAG: %[[X2:.*]] = bitcast i8** %[[X]] to i64* ; CHECK-DAG: %[[Y2:.*]] = bitcast i8** %[[Y]] to i64* ; CHECK: %[[V:.*]] = load i64, i64* %[[X2]] ; CHECK-NEXT: store i64 %[[V]], i64* %[[Y2]] ; CHECK-NEXT: %[[C:.*]] = inttoptr i64 %[[V]] to i8* ; CHECK-NEXT: ret i8* %[[C]] entry: %x = alloca i8* %y = alloca i64 %x1 = bitcast i8** %x to i64* %y1 = bitcast i64* %y to i8** call void @scribble_on_i64(i64* %x1) call void @scribble_on_i64(i64* %y) %tmp = load i64, i64* %x1 store i64 %tmp, i64* %y %p = select i1 %flag, i8** %x, i8** %y1 %v = load i8*, i8** %p ret i8* %v } define i64 @test84(i1 %flag) { ; Test that we can speculate the load around the select even though they use ; differently typed pointers and requires a ptrtoint cast. 
; CHECK-LABEL: @test84(
; CHECK: %[[X:.*]] = alloca i8*
; CHECK-NEXT: %[[Y:.*]] = alloca i8*
; CHECK: %[[V:.*]] = load i8*, i8** %[[X]]
; CHECK-NEXT: store i8* %[[V]], i8** %[[Y]]
; CHECK-NEXT: %[[C:.*]] = ptrtoint i8* %[[V]] to i64
; CHECK-NEXT: ret i64 %[[C]]
entry:
  %x = alloca i8*
  %y = alloca i64
  %x1 = bitcast i8** %x to i64*
  %y1 = bitcast i64* %y to i8**
  call void @scribble_on_i64(i64* %x1)
  call void @scribble_on_i64(i64* %y)
  %tmp = load i8*, i8** %x
  store i8* %tmp, i8** %y1
  %p = select i1 %flag, i64* %x1, i64* %y
  %v = load i64, i64* %p
  ret i64 %v
}

define i8* @test85(i1 %flag) {
; Test that the loads speculated around the select are not forwarded from the
; store: the pointer load doesn't read all of the stored integer bits. We could
; fix this, but it would require endianness checks and other nastiness.
; CHECK-LABEL: @test85(
; CHECK: %[[T:.*]] = load i128, i128*
; CHECK-NEXT: store i128 %[[T]], i128*
; CHECK-NEXT: %[[X:.*]] = load i8*, i8**
; CHECK-NEXT: %[[Y:.*]] = load i8*, i8**
; CHECK-NEXT: %[[V:.*]] = select i1 %flag, i8* %[[X]], i8* %[[Y]]
; CHECK-NEXT: ret i8* %[[V]]
entry:
  %x = alloca [2 x i8*]
  %y = alloca i128
  %x1 = bitcast [2 x i8*]* %x to i8**
  %x2 = bitcast i8** %x1 to i128*
  %y1 = bitcast i128* %y to i8**
  call void @scribble_on_i128(i128* %x2)
  call void @scribble_on_i128(i128* %y)
  %tmp = load i128, i128* %x2
  store i128 %tmp, i128* %y
  %p = select i1 %flag, i8** %x1, i8** %y1
  %v = load i8*, i8** %p
  ret i8* %v
}

define i128 @test86(i1 %flag) {
; Test that the loads speculated around the select are not forwarded from the
; store when the integer size is larger than the pointer size. The store of the
; pointer doesn't store to all the bits of the integer.
;
; CHECK-LABEL: @test86(
; CHECK: %[[T:.*]] = load i8*, i8**
; CHECK-NEXT: store i8* %[[T]], i8**
; CHECK-NEXT: %[[X:.*]] = load i128, i128*
; CHECK-NEXT: %[[Y:.*]] = load i128, i128*
; CHECK-NEXT: %[[V:.*]] = select i1 %flag, i128 %[[X]], i128 %[[Y]]
; CHECK-NEXT: ret i128 %[[V]]
entry:
  %x = alloca [2 x i8*]
  %y = alloca i128
  %x1 = bitcast [2 x i8*]* %x to i8**
  %x2 = bitcast i8** %x1 to i128*
  %y1 = bitcast i128* %y to i8**
  call void @scribble_on_i128(i128* %x2)
  call void @scribble_on_i128(i128* %y)
  %tmp = load i8*, i8** %x1
  store i8* %tmp, i8** %y1
  %p = select i1 %flag, i128* %x2, i128* %y
  %v = load i128, i128* %p
  ret i128 %v
}

define i32 @test_select_select0(i32 %a, i32 %r0, i32 %r1, i32 %v1, i32 %v2) {
; CHECK-LABEL: @test_select_select0(
; CHECK: %[[C0:.*]] = icmp sge i32 %a, %v1
; CHECK-NEXT: %[[C1:.*]] = icmp slt i32 %a, %v2
; CHECK-NEXT: %[[C:.*]] = and i1 %[[C1]], %[[C0]]
; CHECK-NEXT: %[[SEL:.*]] = select i1 %[[C]], i32 %r0, i32 %r1
; CHECK-NEXT: ret i32 %[[SEL]]
  %c0 = icmp sge i32 %a, %v1
  %s0 = select i1 %c0, i32 %r0, i32 %r1
  %c1 = icmp slt i32 %a, %v2
  %s1 = select i1 %c1, i32 %s0, i32 %r1
  ret i32 %s1
}

define i32 @test_select_select1(i32 %a, i32 %r0, i32 %r1, i32 %v1, i32 %v2) {
; CHECK-LABEL: @test_select_select1(
; CHECK: %[[C0:.*]] = icmp sge i32 %a, %v1
; CHECK-NEXT: %[[C1:.*]] = icmp slt i32 %a, %v2
; CHECK-NEXT: %[[C:.*]] = or i1 %[[C1]], %[[C0]]
; CHECK-NEXT: %[[SEL:.*]] = select i1 %[[C]], i32 %r0, i32 %r1
; CHECK-NEXT: ret i32 %[[SEL]]
  %c0 = icmp sge i32 %a, %v1
  %s0 = select i1 %c0, i32 %r0, i32 %r1
  %c1 = icmp slt i32 %a, %v2
  %s1 = select i1 %c1, i32 %r0, i32 %s0
  ret i32 %s1
}

define i32 @test_max_of_min(i32 %a) {
; MAX(MIN(~%a, -1), -1) == -1
; CHECK-LABEL: @test_max_of_min(
; CHECK: ret i32 -1
  %not_a = xor i32 %a, -1
  %c0 = icmp sgt i32 %a, 0
  %s0 = select i1 %c0, i32 %not_a, i32 -1
  %c1 = icmp sgt i32 %s0, -1
  %s1 = select i1 %c1, i32 %s0, i32 -1
  ret i32 %s1
}

define i32 @PR23757(i32 %x) {
; CHECK-LABEL: @PR23757(
; CHECK: %[[cmp:.*]] = icmp eq i32
%x, 2147483647
; CHECK-NEXT: %[[add:.*]] = add nsw i32 %x, 1
; CHECK-NEXT: %[[sel:.*]] = select i1 %[[cmp]], i32 -2147483648, i32 %[[add]]
; CHECK-NEXT: ret i32 %[[sel]]
  %cmp = icmp eq i32 %x, 2147483647
  %add = add nsw i32 %x, 1
  %sel = select i1 %cmp, i32 -2147483648, i32 %add
  ret i32 %sel
}
diff --git a/llvm/test/Transforms/TailCallElim/dont_reorder_load.ll b/llvm/test/Transforms/TailCallElim/dont_reorder_load.ll
index ac399a1bf5a9..f8542799cc64 100644
--- a/llvm/test/Transforms/TailCallElim/dont_reorder_load.ll
+++ b/llvm/test/Transforms/TailCallElim/dont_reorder_load.ll
@@ -1,64 +1,82 @@
-; RUN: opt < %s -tailcallelim -S | grep call | count 3
+; RUN: opt < %s -tailcallelim -S | grep call | count 4
; PR4323

; Several cases where tail call elimination should not move the load above the
; call, and thus can't eliminate the tail recursion.

@extern_weak_global = extern_weak global i32 ; [#uses=1]

; This load can't be safely moved above the call because the load is from an
; extern_weak global and may trap, but the call may unwind before that happens.
define fastcc i32 @no_tailrecelim_1(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) readonly {
entry:
  %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1]
  br i1 %tmp2, label %if, label %else

if: ; preds = %entry
  ret i32 37

else: ; preds = %entry
  %tmp7 = add i32 %start_arg, 1 ; [#uses=1]
  %tmp8 = call fastcc i32 @no_tailrecelim_1(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1]
  %tmp9 = load i32, i32* @extern_weak_global ; [#uses=1]
  %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1]
  ret i32 %tmp10
}

; This load can't be safely moved above the call because the function may write to the pointer.
define fastcc i32 @no_tailrecelim_2(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind {
entry:
  %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1]
  br i1 %tmp2, label %if, label %else

if: ; preds = %entry
  store i32 1, i32* %a_arg
  ret i32 0

else: ; preds = %entry
  %tmp7 = add i32 %start_arg, 1 ; [#uses=1]
  %tmp8 = call fastcc i32 @no_tailrecelim_2(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1]
  %tmp9 = load i32, i32* %a_arg ; [#uses=1]
  %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1]
  ret i32 %tmp10
}

; This load can't be safely moved above the call because that would change the
; order in which the volatile loads are performed.
define fastcc i32 @no_tailrecelim_3(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind {
entry:
  %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1]
  br i1 %tmp2, label %if, label %else

if: ; preds = %entry
  ret i32 0

else: ; preds = %entry
  %tmp7 = add i32 %start_arg, 1 ; [#uses=1]
  %tmp8 = call fastcc i32 @no_tailrecelim_3(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1]
  %tmp9 = load volatile i32, i32* %a_arg ; [#uses=1]
  %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1]
  ret i32 %tmp10
}
+
+; This load can NOT be moved above the call because %a_arg is only
+; dereferenceable(2), which does not cover the 4-byte i32 load.
+define fastcc i32 @no_tailrecelim_4(i32* dereferenceable(2) %a_arg, i32 %a_len_arg, i32 %start_arg) readonly {
+entry:
+  %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1]
+  br i1 %tmp2, label %if, label %else
+
+if: ; preds = %entry
+  ret i32 0
+
+else: ; preds = %entry
+  %tmp7 = add i32 %start_arg, 1 ; [#uses=1]
+  %tmp8 = call fastcc i32 @no_tailrecelim_4(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1]
+  %tmp9 = load i32, i32* %a_arg ; [#uses=1]
+  %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1]
+  ret i32 %tmp10
+}
diff --git a/llvm/test/Transforms/TailCallElim/reorder_load.ll b/llvm/test/Transforms/TailCallElim/reorder_load.ll
index b989bbf9547a..2f9b692d0991 100644
--- a/llvm/test/Transforms/TailCallElim/reorder_load.ll
+++ b/llvm/test/Transforms/TailCallElim/reorder_load.ll
@@ -1,124 +1,147 @@
; RUN: opt < %s -tailcallelim -S | FileCheck %s
; PR4323

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Several cases where tail call elimination should move the load above the call,
; then eliminate the tail recursion.

@global = external global i32 ; [#uses=1]
@extern_weak_global = extern_weak global i32 ; [#uses=1]

; This load can be moved above the call because the function won't write to it
; and the call has no side effects (the function is nounwind readonly).
define fastcc i32 @raise_load_1(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind readonly {
; CHECK-LABEL: @raise_load_1(
; CHECK-NOT: call
; CHECK: load i32, i32*
; CHECK-NOT: call
; CHECK: }
entry:
  %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1]
  br i1 %tmp2, label %if, label %else

if: ; preds = %entry
  ret i32 0

else: ; preds = %entry
  %tmp7 = add i32 %start_arg, 1 ; [#uses=1]
  %tmp8 = call fastcc i32 @raise_load_1(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1]
  %tmp9 = load i32, i32* %a_arg ; [#uses=1]
  %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1]
  ret i32 %tmp10
}

; This load can be moved above the call because the function won't write to it
; and the load (from @global) provably can't trap.
define fastcc i32 @raise_load_2(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) readonly {
; CHECK-LABEL: @raise_load_2(
; CHECK-NOT: call
; CHECK: load i32, i32*
; CHECK-NOT: call
; CHECK: }
entry:
  %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1]
  br i1 %tmp2, label %if, label %else

if: ; preds = %entry
  ret i32 0

else: ; preds = %entry
  %nullcheck = icmp eq i32* %a_arg, null ; [#uses=1]
  br i1 %nullcheck, label %unwind, label %recurse

unwind: ; preds = %else
  unreachable

recurse: ; preds = %else
  %tmp7 = add i32 %start_arg, 1 ; [#uses=1]
  %tmp8 = call fastcc i32 @raise_load_2(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1]
  %tmp9 = load i32, i32* @global ; [#uses=1]
  %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1]
  ret i32 %tmp10
}

; This load can be safely moved above the call (even though it's from an
; extern_weak global) because the nounwind readonly call has no side effects.
define fastcc i32 @raise_load_3(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind readonly {
; CHECK-LABEL: @raise_load_3(
; CHECK-NOT: call
; CHECK: load i32, i32*
; CHECK-NOT: call
; CHECK: }
entry:
  %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1]
  br i1 %tmp2, label %if, label %else

if: ; preds = %entry
  ret i32 0

else: ; preds = %entry
  %tmp7 = add i32 %start_arg, 1 ; [#uses=1]
  %tmp8 = call fastcc i32 @raise_load_3(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1]
  %tmp9 = load i32, i32* @extern_weak_global ; [#uses=1]
  %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1]
  ret i32 %tmp10
}

; The second load can be safely moved above the call even though it's from an
; unknown pointer (which normally means it might trap) because the first load
; proves it doesn't trap.
define fastcc i32 @raise_load_4(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) readonly {
; CHECK-LABEL: @raise_load_4(
; CHECK-NOT: call
; CHECK: load i32, i32*
; CHECK-NEXT: load i32, i32*
; CHECK-NOT: call
; CHECK: }
entry:
  %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1]
  br i1 %tmp2, label %if, label %else

if: ; preds = %entry
  ret i32 0

else: ; preds = %entry
  %nullcheck = icmp eq i32* %a_arg, null ; [#uses=1]
  br i1 %nullcheck, label %unwind, label %recurse

unwind: ; preds = %else
  unreachable

recurse: ; preds = %else
  %tmp7 = add i32 %start_arg, 1 ; [#uses=1]
  %first = load i32, i32* %a_arg ; [#uses=1]
  %tmp8 = call fastcc i32 @raise_load_4(i32* %a_arg, i32 %first, i32 %tmp7) ; [#uses=1]
  %second = load i32, i32* %a_arg ; [#uses=1]
  %tmp10 = add i32 %second, %tmp8 ; [#uses=1]
  ret i32 %tmp10
}
+
+; This load can be moved above the call because the function won't write to it
+; and %a_arg is dereferenceable(4), which covers the whole i32 load.
+define fastcc i32 @raise_load_5(i32* dereferenceable(4) %a_arg, i32 %a_len_arg, i32 %start_arg) readonly {
+; CHECK-LABEL: @raise_load_5(
+; CHECK-NOT: call
+; CHECK: load i32, i32*
+; CHECK-NOT: call
+; CHECK: }
+entry:
+  %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1]
+  br i1 %tmp2, label %if, label %else
+
+if: ; preds = %entry
+  ret i32 0
+
+else: ; preds = %entry
+  %tmp7 = add i32 %start_arg, 1 ; [#uses=1]
+  %tmp8 = call fastcc i32 @raise_load_5(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; [#uses=1]
+  %tmp9 = load i32, i32* %a_arg ; [#uses=1]
+  %tmp10 = add i32 %tmp9, %tmp8 ; [#uses=1]
+  ret i32 %tmp10
+}