Index: llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/trunk/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -0,0 +1,482 @@ +//===- llvm/Transforms/Vectorize/LoopVectorizationLegality.h ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file defines the LoopVectorizationLegality class. Original code +/// in Loop Vectorizer has been moved out to its own file for modularity +/// and reusability. +/// +/// Currently, it works for innermost loop vectorization. Extending this to +/// outer loop vectorization is a TODO item. +/// +/// Also provides: +/// 1) LoopVectorizeHints class which keeps a number of loop annotations +/// locally for easy look up. It has the ability to write them back as +/// loop metadata, upon request. +/// 2) LoopVectorizationRequirements class for lazy bail out for the purpose +/// of reporting useful failure to vectorize message. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONLEGALITY_H +#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONLEGALITY_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +namespace llvm { + +/// Create an analysis remark that explains why vectorization failed +/// +/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p +/// RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be +/// streamed to. +OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, + StringRef RemarkName, + Loop *TheLoop, + Instruction *I = nullptr); + +/// Utility class for getting and setting loop vectorizer hints in the form +/// of loop metadata. +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +/// We cannot write all values to metadata, as the mere presence of some info, +/// for example 'force', means a decision has been made. So, we need to be +/// careful NOT to add them if the user hasn't specifically asked so. +class LoopVectorizeHints { + enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char *Name; + unsigned Value; // This may have to change for non-numeric values. + HintKind Kind; + + Hint(const char *Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) {} + + bool validate(unsigned Val); + }; + + /// Vectorization width. + Hint Width; + + /// Vectorization interleave factor. + Hint Interleave; + + /// Vectorization forced + Hint Force; + + /// Already Vectorized + Hint IsVectorized; + + /// Return the loop metadata prefix. + static StringRef Prefix() { return "llvm.loop."; } + + /// True if there is any unsafe math in the loop. + bool PotentiallyUnsafe = false; + +public: + enum ForceKind { + FK_Undefined = -1, ///< Not selected. + FK_Disabled = 0, ///< Forcing disabled. + FK_Enabled = 1, ///< Forcing enabled. + }; + + LoopVectorizeHints(const Loop *L, bool DisableInterleaving, + OptimizationRemarkEmitter &ORE); + + /// Mark the loop L as already vectorized by setting the width to 1. + void setAlreadyVectorized() { + IsVectorized.Value = 1; + Hint Hints[] = {IsVectorized}; + writeHintsToMetadata(Hints); + } + + bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const; + + /// Dumps all the hint information. + void emitRemarkWithHints() const; + + unsigned getWidth() const { return Width.Value; } + unsigned getInterleave() const { return Interleave.Value; } + unsigned getIsVectorized() const { return IsVectorized.Value; } + enum ForceKind getForce() const { return (ForceKind)Force.Value; } + + /// \brief If hints are provided that force vectorization, use the AlwaysPrint + /// pass name to force the frontend to print the diagnostic. + const char *vectorizeAnalysisPassName() const; + + bool allowReordering() const { + // When enabling loop hints are provided we allow the vectorizer to change + // the order of operations that is given by the scalar loop. This is not + // enabled by default because can be unsafe or inefficient. For example, + // reordering floating-point operations will change the way round-off + // error accumulates in the loop. + return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; + } + + bool isPotentiallyUnsafe() const { + // Avoid FP vectorization if the target is unsure about proper support. + // This may be related to the SIMD unit in the target not handling + // IEEE 754 FP ops properly, or bad single-to-double promotions. + // Otherwise, a sequence of vectorized loops, even without reduction, + // could lead to different end results on the destination vectors. + return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe; + } + + void setPotentiallyUnsafe() { PotentiallyUnsafe = true; } + +private: + /// Find hints specified in the loop metadata and update local values. + void getHintsFromMetadata(); + + /// Checks string hint with one operand and set value if valid. + void setHint(StringRef Name, Metadata *Arg); + + /// Create a new hint from name / value pair. + MDNode *createHintMetadata(StringRef Name, unsigned V) const; + + /// Matches metadata with hint name. + bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes); + + /// Sets current hints into loop metadata, keeping other values intact. + void writeHintsToMetadata(ArrayRef HintTypes); + + /// The loop these hints belong to. + const Loop *TheLoop; + + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; +}; + +/// \brief This holds vectorization requirements that must be verified late in +/// the process. The requirements are set by legalize and costmodel. Once +/// vectorization has been determined to be possible and profitable the +/// requirements can be verified by looking for metadata or compiler options. +/// For example, some loops require FP commutativity which is only allowed if +/// vectorization is explicitly specified or if the fast-math compiler option +/// has been provided. +/// Late evaluation of these requirements allows helpful diagnostics to be +/// composed that tells the user what need to be done to vectorize the loop. For +/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late +/// evaluation should be used only when diagnostics can generated that can be +/// followed by a non-expert user. +class LoopVectorizationRequirements { +public: + LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {} + + void addUnsafeAlgebraInst(Instruction *I) { + // First unsafe algebra instruction. + if (!UnsafeAlgebraInst) + UnsafeAlgebraInst = I; + } + + void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } + + bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints); + +private: + unsigned NumRuntimePointerChecks = 0; + Instruction *UnsafeAlgebraInst = nullptr; + + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; +}; + +/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and +/// to what vectorization factor. +/// This class does not look at the profitability of vectorization, only the +/// legality. This class has two main kinds of checks: +/// * Memory checks - The code in canVectorizeMemory checks if vectorization +/// will change the order of memory accesses in a way that will change the +/// correctness of the program. +/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory +/// checks for a number of different conditions, such as the availability of a +/// single induction variable, that all types are supported and vectorize-able, +/// etc. This code reflects the capabilities of InnerLoopVectorizer. +/// This class is also used by InnerLoopVectorizer for identifying +/// induction variable and the different reduction variables. +class LoopVectorizationLegality { +public: + LoopVectorizationLegality( + Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT, + TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, + std::function *GetLAA, LoopInfo *LI, + OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, + LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC) + : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA), + ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {} + + /// ReductionList contains the reduction descriptors for all + /// of the reductions that were found in the loop. + using ReductionList = DenseMap; + + /// InductionList saves induction variables and maps them to the + /// induction descriptor. + using InductionList = MapVector; + + /// RecurrenceSet contains the phi nodes that are recurrences other than + /// inductions and reductions. + using RecurrenceSet = SmallPtrSet; + + /// Returns true if it is legal to vectorize this loop. + /// This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. + /// Temporarily taking UseVPlanNativePath parameter. If true, take + /// the new code path being implemented for outer loop vectorization + /// (should be functional for inner loop vectorization) based on VPlan. + /// If false, good old LV code. + bool canVectorize(bool UseVPlanNativePath); + + /// Returns the primary induction variable. + PHINode *getPrimaryInduction() { return PrimaryInduction; } + + /// Returns the reduction variables found in the loop. + ReductionList *getReductionVars() { return &Reductions; } + + /// Returns the induction variables found in the loop. + InductionList *getInductionVars() { return &Inductions; } + + /// Return the first-order recurrences found in the loop. + RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } + + /// Return the set of instructions to sink to handle first-order recurrences. + DenseMap &getSinkAfter() { return SinkAfter; } + + /// Returns the widest induction type. + Type *getWidestInductionType() { return WidestIndTy; } + + /// Returns True if V is a Phi node of an induction variable in this loop. + bool isInductionPhi(const Value *V); + + /// Returns True if V is a cast that is part of an induction def-use chain, + /// and had been proven to be redundant under a runtime guard (in other + /// words, the cast has the same SCEV expression as the induction phi). + bool isCastedInductionVariable(const Value *V); + + /// Returns True if V can be considered as an induction variable in this + /// loop. V can be the induction phi, or some redundant cast in the def-use + /// chain of the inducion phi. + bool isInductionVariable(const Value *V); + + /// Returns True if PN is a reduction variable in this loop. + bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } + + /// Returns True if Phi is a first-order recurrence in this loop. + bool isFirstOrderRecurrence(const PHINode *Phi); + + /// Return true if the block BB needs to be predicated in order for the loop + /// to be vectorized. + bool blockNeedsPredication(BasicBlock *BB); + + /// Check if this pointer is consecutive when vectorizing. This happens + /// when the last index of the GEP is the induction variable, or that the + /// pointer itself is an induction variable. + /// This check allows us to vectorize A[idx] into a wide load/store. + /// Returns: + /// 0 - Stride is unknown or non-consecutive. + /// 1 - Address is consecutive. + /// -1 - Address is consecutive, and decreasing. + /// NOTE: This method must only be used before modifying the original scalar + /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965). + int isConsecutivePtr(Value *Ptr); + + /// Returns true if the value V is uniform within the loop. + bool isUniform(Value *V); + + /// Returns the information that we collected about runtime memory check. + const RuntimePointerChecking *getRuntimePointerChecking() const { + return LAI->getRuntimePointerChecking(); + } + + const LoopAccessInfo *getLAI() const { return LAI; } + + unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } + + uint64_t getMaxSafeRegisterWidth() const { + return LAI->getDepChecker().getMaxSafeRegisterWidth(); + } + + bool hasStride(Value *V) { return LAI->hasStride(V); } + + /// Returns true if vector representation of the instruction \p I + /// requires mask. + bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); } + + unsigned getNumStores() const { return LAI->getNumStores(); } + unsigned getNumLoads() const { return LAI->getNumLoads(); } + + // Returns true if the NoNaN attribute is set on the function. + bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } + +private: + /// Return true if the pre-header, exiting and latch blocks of \p Lp and all + /// its nested loops are considered legal for vectorization. These legal + /// checks are common for inner and outer loop vectorization. + /// Temporarily taking UseVPlanNativePath parameter. If true, take + /// the new code path being implemented for outer loop vectorization + /// (should be functional for inner loop vectorization) based on VPlan. + /// If false, good old LV code. + bool canVectorizeLoopNestCFG(Loop *Lp, bool UseVPlanNativePath); + + /// Return true if the pre-header, exiting and latch blocks of \p Lp + /// (non-recursive) are considered legal for vectorization. + /// Temporarily taking UseVPlanNativePath parameter. If true, take + /// the new code path being implemented for outer loop vectorization + /// (should be functional for inner loop vectorization) based on VPlan. + /// If false, good old LV code. + bool canVectorizeLoopCFG(Loop *Lp, bool UseVPlanNativePath); + + /// Check if a single basic block loop is vectorizable. + /// At this point we know that this is a loop with a constant trip count + /// and we only need to check individual instructions. + bool canVectorizeInstrs(); + + /// When we vectorize loops we may change the order in which + /// we read and write from memory. This method checks if it is + /// legal to vectorize the code, considering only memory constrains. + /// Returns true if the loop is vectorizable + bool canVectorizeMemory(); + + /// Return true if we can vectorize this loop using the IF-conversion + /// transformation. + bool canVectorizeWithIfConvert(); + + /// Return true if we can vectorize this outer loop. The method performs + /// specific checks for outer loop vectorization. + bool canVectorizeOuterLoop(); + + /// Return true if all of the instructions in the block can be speculatively + /// executed. \p SafePtrs is a list of addresses that are known to be legal + /// and we know that we can read from them without segfault. + bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs); + + /// Updates the vectorization state by adding \p Phi to the inductions list. + /// This can set \p Phi as the main induction of the loop if \p Phi is a + /// better choice for the main induction than the existing one. + void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID, + SmallPtrSetImpl &AllowedExit); + + /// Create an analysis remark that explains why vectorization failed + /// + /// \p RemarkName is the identifier for the remark. If \p I is passed it is + /// an instruction that prevents vectorization. Otherwise the loop is used + /// for the location of the remark. \return the remark object that can be + /// streamed to. + OptimizationRemarkAnalysis + createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const { + return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), + RemarkName, TheLoop, I); + } + + /// \brief If an access has a symbolic strides, this maps the pointer value to + /// the stride symbol. + const ValueToValueMap *getSymbolicStrides() { + // FIXME: Currently, the set of symbolic strides is sometimes queried before + // it's collected. This happens from canVectorizeWithIfConvert, when the + // pointer is checked to reference consecutive elements suitable for a + // masked access. + return LAI ? &LAI->getSymbolicStrides() : nullptr; + } + + /// The loop that we evaluate. + Loop *TheLoop; + + /// Loop Info analysis. + LoopInfo *LI; + + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. + /// Applies dynamic knowledge to simplify SCEV expressions in the context + /// of existing SCEV assumptions. The analysis will also add a minimal set + /// of new predicates if this is required to enable vectorization and + /// unrolling. + PredicatedScalarEvolution &PSE; + + /// Target Library Info. + TargetLibraryInfo *TLI; + + /// Dominator Tree. + DominatorTree *DT; + + // LoopAccess analysis. + std::function *GetLAA; + + // And the loop-accesses info corresponding to this loop. This pointer is + // null until canVectorizeMemory sets it up. + const LoopAccessInfo *LAI = nullptr; + + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; + + // --- vectorization state --- // + + /// Holds the primary induction variable. This is the counter of the + /// loop. + PHINode *PrimaryInduction = nullptr; + + /// Holds the reduction variables. + ReductionList Reductions; + + /// Holds all of the induction variables that we found in the loop. + /// Notice that inductions don't need to start at zero and that induction + /// variables can be pointers. + InductionList Inductions; + + /// Holds all the casts that participate in the update chain of the induction + /// variables, and that have been proven to be redundant (possibly under a + /// runtime guard). These casts can be ignored when creating the vectorized + /// loop body. + SmallPtrSet InductionCastsToIgnore; + + /// Holds the phi nodes that are first-order recurrences. + RecurrenceSet FirstOrderRecurrences; + + /// Holds instructions that need to sink past other instructions to handle + /// first-order recurrences. + DenseMap SinkAfter; + + /// Holds the widest induction type encountered. + Type *WidestIndTy = nullptr; + + /// Allowed outside users. This holds the induction and reduction + /// vars which can be accessed from outside the loop. + SmallPtrSet AllowedExit; + + /// Can we assume the absence of NaNs. + bool HasFunNoNaNAttr = false; + + /// Vectorization requirements that will go through late-evaluation. + LoopVectorizationRequirements *Requirements; + + /// Used to emit an analysis of any legality issues. + LoopVectorizeHints *Hints; + + /// The demanded bits analsyis is used to compute the minimum type size in + /// which a reduction can be computed. + DemandedBits *DB; + + /// The assumption cache analysis is used to compute the minimum type size in + /// which a reduction can be computed. + AssumptionCache *AC; + + /// While vectorizing these instructions we have to generate a + /// call to the appropriate masked intrinsic + SmallPtrSet MaskedOp; +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONLEGALITY_H Index: llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt +++ llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMVectorize LoadStoreVectorizer.cpp + LoopVectorizationLegality.cpp LoopVectorize.cpp SLPVectorizer.cpp Vectorize.cpp Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -0,0 +1,1068 @@ +//===- LoopVectorizationLegality.cpp --------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides loop vectorization legality analysis. Original code +// resided in LoopVectorize.cpp for a long time. +// +// At this point, it is implemented as a utility class, not as an analysis +// pass. It should be easy to create an analysis pass around it if there +// is a need (but D45420 needs to happen first). +// +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/IntrinsicInst.h" + +using namespace llvm; + +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + +static cl::opt + EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, + cl::desc("Enable if-conversion during vectorization.")); + +static cl::opt PragmaVectorizeMemoryCheckThreshold( + "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks with a " + "vectorize(enable) pragma.")); + +static cl::opt VectorizeSCEVCheckThreshold( + "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed.")); + +static cl::opt PragmaVectorizeSCEVCheckThreshold( + "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed with a " + "vectorize(enable) pragma")); + +/// Maximum vectorization interleave count. +static const unsigned MaxInterleaveFactor = 16; + +namespace llvm { + +OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, + StringRef RemarkName, + Loop *TheLoop, + Instruction *I) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + +bool LoopVectorizeHints::Hint::validate(unsigned Val) { + switch (Kind) { + case HK_WIDTH: + return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; + case HK_UNROLL: + return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; + case HK_FORCE: + return (Val <= 1); + case HK_ISVECTORIZED: + return (Val == 0 || Val == 1); + } + return false; +} + +LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool DisableInterleaving, + OptimizationRemarkEmitter &ORE) + : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), + Interleave("interleave.count", DisableInterleaving, HK_UNROLL), + Force("vectorize.enable", FK_Undefined, HK_FORCE), + IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); + + // force-vector-interleave overrides DisableInterleaving. + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; + + if (IsVectorized.Value != 1) + // If the vectorization width and interleaving count are both 1 then + // consider the loop to have been already vectorized because there's + // nothing more that we can do. + IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; + DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() + << "LV: Interleaving disabled by the pass manager\n"); +} + +bool LoopVectorizeHints::allowVectorization(Function *F, Loop *L, + bool AlwaysVectorize) const { + if (getForce() == LoopVectorizeHints::FK_Disabled) { + DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); + emitRemarkWithHints(); + return false; + } + + if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { + DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); + emitRemarkWithHints(); + return false; + } + + if (getIsVectorized() == 1) { + DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); + // FIXME: Add interleave.disable metadata. This will allow + // vectorize.disable to be used without disabling the pass and errors + // to differentiate between disabled vectorization and a width of 1. + ORE.emit([&]() { + return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), + "AllDisabled", L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: vectorization and interleaving are " + "explicitly disabled, or the loop has already been " + "vectorized"; + }); + return false; + } + + return true; +} + +void LoopVectorizeHints::emitRemarkWithHints() const { + using namespace ore; + + ORE.emit([&]() { + if (Force.Value == LoopVectorizeHints::FK_Disabled) + return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "loop not vectorized: vectorization is explicitly disabled"; + else { + OptimizationRemarkMissed R(LV_NAME, "MissedDetails", + TheLoop->getStartLoc(), TheLoop->getHeader()); + R << "loop not vectorized"; + if (Force.Value == LoopVectorizeHints::FK_Enabled) { + R << " (Force=" << NV("Force", true); + if (Width.Value != 0) + R << ", Vector Width=" << NV("VectorWidth", Width.Value); + if (Interleave.Value != 0) + R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value); + R << ")"; + } + return R; + } + }); +} + +const char *LoopVectorizeHints::vectorizeAnalysisPassName() const { + if (getWidth() == 1) + return LV_NAME; + if (getForce() == LoopVectorizeHints::FK_Disabled) + return LV_NAME; + if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) + return LV_NAME; + return OptimizationRemarkAnalysis::AlwaysPrint; +} + +void LoopVectorizeHints::getHintsFromMetadata() { + MDNode *LoopID = TheLoop->getLoopID(); + if (!LoopID) + return; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. + StringRef Name = S->getString(); + if (Args.size() == 1) + setHint(Name, Args[0]); + } +} + +void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract(Arg); + if (!C) + return; + unsigned Val = C->getZExtValue(); + + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); + break; + } + } +} + +MDNode *LoopVectorizeHints::createHintMetadata(StringRef Name, + unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = { + MDString::get(Context, Name), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); +} + +bool LoopVectorizeHints::matchesHintMetadataName(MDNode *Node, + ArrayRef HintTypes) { + MDString *Name = dyn_cast(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; +} + +void LoopVectorizeHints::writeHintsToMetadata(ArrayRef HintTypes) { + if (HintTypes.empty()) + return; + + // Reserve the first element to LoopID (see below). + SmallVector MDs(1); + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. + for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); +} + +bool LoopVectorizationRequirements::doesNotMeet( + Function *F, Loop *L, const LoopVectorizeHints &Hints) { + const char *PassName = Hints.vectorizeAnalysisPassName(); + bool Failed = false; + if (UnsafeAlgebraInst && !Hints.allowReordering()) { + ORE.emit([&]() { + return OptimizationRemarkAnalysisFPCommute( + PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(), + UnsafeAlgebraInst->getParent()) + << "loop not vectorized: cannot prove it is safe to reorder " + "floating-point operations"; + }); + Failed = true; + } + + // Test if runtime memcheck thresholds are exceeded. + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + ORE.emit([&]() { + return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", + L->getStartLoc(), + L->getHeader()) + << "loop not vectorized: cannot prove it is safe to reorder " + "memory operations"; + }); + DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Failed = true; + } + + return Failed; +} + +// Return true if the inner loop \p Lp is uniform with regard to the outer loop +// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes +// executing the inner loop will execute the same iterations). This check is +// very constrained for now but it will be relaxed in the future. \p Lp is +// considered uniform if it meets all the following conditions: +// 1) it has a canonical IV (starting from 0 and with stride 1), +// 2) its latch terminator is a conditional branch and, +// 3) its latch condition is a compare instruction whose operands are the +// canonical IV and an OuterLp invariant. +// This check doesn't take into account the uniformity of other conditions not +// related to the loop latch because they don't affect the loop uniformity. +// +// NOTE: We decided to keep all these checks and its associated documentation +// together so that we can easily have a picture of the current supported loop +// nests. However, some of the current checks don't depend on \p OuterLp and +// would be redundantly executed for each \p Lp if we invoked this function for +// different candidate outer loops. This is not the case for now because we +// don't currently have the infrastructure to evaluate multiple candidate outer +// loops and \p OuterLp will be a fixed parameter while we only support explicit +// outer loop vectorization. It's also very likely that these checks go away +// before introducing the aforementioned infrastructure. However, if this is not +// the case, we should move the \p OuterLp independent checks to a separate +// function that is only executed once for each \p Lp. +static bool isUniformLoop(Loop *Lp, Loop *OuterLp) { + assert(Lp->getLoopLatch() && "Expected loop with a single latch."); + + // If Lp is the outer loop, it's uniform by definition. + if (Lp == OuterLp) + return true; + assert(OuterLp->contains(Lp) && "OuterLp must contain Lp."); + + // 1. + PHINode *IV = Lp->getCanonicalInductionVariable(); + if (!IV) { + DEBUG(dbgs() << "LV: Canonical IV not found.\n"); + return false; + } + + // 2. + BasicBlock *Latch = Lp->getLoopLatch(); + auto *LatchBr = dyn_cast(Latch->getTerminator()); + if (!LatchBr || LatchBr->isUnconditional()) { + DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n"); + return false; + } + + // 3. + auto *LatchCmp = dyn_cast(LatchBr->getCondition()); + if (!LatchCmp) { + DEBUG(dbgs() << "LV: Loop latch condition is not a compare instruction.\n"); + return false; + } + + Value *CondOp0 = LatchCmp->getOperand(0); + Value *CondOp1 = LatchCmp->getOperand(1); + Value *IVUpdate = IV->getIncomingValueForBlock(Latch); + if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) && + !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) { + DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n"); + return false; + } + + return true; +} + +// Return true if \p Lp and all its nested loops are uniform with regard to \p +// OuterLp. +static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) { + if (!isUniformLoop(Lp, OuterLp)) + return false; + + // Check if nested loops are uniform. + for (Loop *SubLp : *Lp) + if (!isUniformLoopNest(SubLp, OuterLp)) + return false; + + return true; +} + +/// \brief Check whether it is safe to if-convert this phi node. +/// +/// Phi nodes with constant expressions that can trap are not safe to if +/// convert. +static bool canIfConvertPHINodes(BasicBlock *BB) { + for (PHINode &Phi : BB->phis()) { + for (Value *V : Phi.incoming_values()) + if (auto *C = dyn_cast(V)) + if (C->canTrap()) + return false; + } + return true; +} + +static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { + if (Ty->isPointerTy()) + return DL.getIntPtrType(Ty); + + // It is possible that char's or short's overflow when we ask for the loop's + // trip count, work around this by changing the type size. + if (Ty->getScalarSizeInBits() < 32) + return Type::getInt32Ty(Ty->getContext()); + + return Ty; +} + +static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { + Ty0 = convertPointerToIntegerType(DL, Ty0); + Ty1 = convertPointerToIntegerType(DL, Ty1); + if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) + return Ty0; + return Ty1; +} + +/// \brief Check that the instruction has outside loop users and is not an +/// identified reduction variable. +static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, + SmallPtrSetImpl &AllowedExit) { + // Reduction and Induction instructions are allowed to have exit users. All + // other instructions must not have external users. + if (!AllowedExit.count(Inst)) + // Check that all of the users of the loop are inside the BB. + for (User *U : Inst->users()) { + Instruction *UI = cast(U); + // This user may be a reduction exit value. + if (!TheLoop->contains(UI)) { + DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); + return true; + } + } + return false; +} + +int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { + const ValueToValueMap &Strides = + getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap(); + + int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); + if (Stride == 1 || Stride == -1) + return Stride; + return 0; +} + +bool LoopVectorizationLegality::isUniform(Value *V) { + return LAI->isUniform(V); +} + +bool LoopVectorizationLegality::canVectorizeOuterLoop() { + assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + + for (BasicBlock *BB : TheLoop->blocks()) { + // Check whether the BB terminator is a BranchInst. Any other terminator is + // not supported yet. + auto *Br = dyn_cast(BB->getTerminator()); + if (!Br) { + DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Check whether the BranchInst is a supported one. Only unconditional + // branches, conditional branches with an outer loop invariant condition or + // backedges are supported. + if (Br && Br->isConditional() && + !TheLoop->isLoopInvariant(Br->getCondition()) && + !LI->isLoopHeader(Br->getSuccessor(0)) && + !LI->isLoopHeader(Br->getSuccessor(1))) { + DEBUG(dbgs() << "LV: Unsupported conditional branch.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + } + + // Check whether inner loops are uniform. At this point, we only support + // simple outer loops scenarios with uniform nested loops. + if (!isUniformLoopNest(TheLoop /*loop nest*/, + TheLoop /*context outer loop*/)) { + DEBUG(dbgs() + << "LV: Not vectorizing: Outer loop contains divergent loops.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + return Result; +} + +void LoopVectorizationLegality::addInductionPhi( + PHINode *Phi, const InductionDescriptor &ID, + SmallPtrSetImpl &AllowedExit) { + Inductions[Phi] = ID; + + // In case this induction also comes with casts that we know we can ignore + // in the vectorized loop body, record them here. All casts could be recorded + // here for ignoring, but suffices to record only the first (as it is the + // only one that may bw used outside the cast sequence). + const SmallVectorImpl &Casts = ID.getCastInsts(); + if (!Casts.empty()) + InductionCastsToIgnore.insert(*Casts.begin()); + + Type *PhiTy = Phi->getType(); + const DataLayout &DL = Phi->getModule()->getDataLayout(); + + // Get the widest type. + if (!PhiTy->isFloatingPointTy()) { + if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(DL, PhiTy); + else + WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); + } + + // Int inductions are special because we only allow one IV. + if (ID.getKind() == InductionDescriptor::IK_IntInduction && + ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() && + isa(ID.getStartValue()) && + cast(ID.getStartValue())->isNullValue()) { + + // Use the phi node with the widest type as induction. Use the last + // one if there are multiple (no good reason for doing this other + // than it is expedient). We've checked that it begins at zero and + // steps by one, so this is a canonical induction variable. + if (!PrimaryInduction || PhiTy == WidestIndTy) + PrimaryInduction = Phi; + } + + // Both the PHI node itself, and the "post-increment" value feeding + // back into the PHI node may have external users. + // We can allow those uses, except if the SCEVs we have for them rely + // on predicates that only hold within the loop, since allowing the exit + // currently means re-using this SCEV outside the loop. + if (PSE.getUnionPredicate().isAlwaysTrue()) { + AllowedExit.insert(Phi); + AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); + } + + DEBUG(dbgs() << "LV: Found an induction variable.\n"); +} + +bool LoopVectorizationLegality::canVectorizeInstrs() { + BasicBlock *Header = TheLoop->getHeader(); + + // Look for the attribute signaling the absence of NaNs. + Function &F = *Header->getParent(); + HasFunNoNaNAttr = + F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; + + // For each block in the loop. + for (BasicBlock *BB : TheLoop->blocks()) { + // Scan the instructions in the block and look for hazards. + for (Instruction &I : *BB) { + if (auto *Phi = dyn_cast(&I)) { + Type *PhiTy = Phi->getType(); + // Check that this PHI type is allowed. + if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) + << "loop control flow is not understood by vectorizer"); + DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); + return false; + } + + // If this PHINode is not in the header block, then we know that we + // can convert it to select during if-conversion. No need to check if + // the PHIs in this block are induction or reduction variables. + if (BB != Header) { + // Check that this instruction has no outside users or is an + // identified reduction value with an outside user. + if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit)) + continue; + ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi) + << "value could not be identified as " + "an induction or reduction variable"); + return false; + } + + // We only allow if-converted PHIs with exactly two incoming values. + if (Phi->getNumIncomingValues() != 2) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) + << "control flow not understood by vectorizer"); + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } + + RecurrenceDescriptor RedDes; + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, + DT)) { + if (RedDes.hasUnsafeAlgebra()) + Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); + AllowedExit.insert(RedDes.getLoopExitInstr()); + Reductions[Phi] = RedDes; + continue; + } + + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { + addInductionPhi(Phi, ID, AllowedExit); + if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) + Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); + continue; + } + + if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, + SinkAfter, DT)) { + FirstOrderRecurrences.insert(Phi); + continue; + } + + // As a last resort, coerce the PHI to a AddRec expression + // and re-try classifying it a an induction PHI. + if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { + addInductionPhi(Phi, ID, AllowedExit); + continue; + } + + ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi) + << "value that could not be identified as " + "reduction is used outside the loop"); + DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n"); + return false; + } // end of PHI handling + + // We handle calls that: + // * Are debug info intrinsics. + // * Have a mapping to an IR intrinsic. + // * Have a vector version available. + auto *CI = dyn_cast(&I); + if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && + !isa(CI) && + !(CI->getCalledFunction() && TLI && + TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { + ORE->emit(createMissedAnalysis("CantVectorizeCall", CI) + << "call instruction cannot be vectorized"); + DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); + return false; + } + + // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the + // second argument is the same (i.e. loop invariant) + if (CI && hasVectorInstrinsicScalarOpd( + getVectorIntrinsicIDForCall(CI, TLI), 1)) { + auto *SE = PSE.getSE(); + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { + ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI) + << "intrinsic instruction cannot be vectorized"); + DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); + return false; + } + } + + // Check that the instruction return type is vectorizable. + // Also, we can't vectorize extractelement instructions. + if ((!VectorType::isValidElementType(I.getType()) && + !I.getType()->isVoidTy()) || + isa(I)) { + ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I) + << "instruction return type cannot be vectorized"); + DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); + return false; + } + + // Check that the stored type is vectorizable. + if (auto *ST = dyn_cast(&I)) { + Type *T = ST->getValueOperand()->getType(); + if (!VectorType::isValidElementType(T)) { + ORE->emit(createMissedAnalysis("CantVectorizeStore", ST) + << "store instruction cannot be vectorized"); + return false; + } + + // FP instructions can allow unsafe algebra, thus vectorizable by + // non-IEEE-754 compliant SIMD units. + // This applies to floating-point math operations and calls, not memory + // operations, shuffles, or casts, as they don't change precision or + // semantics. + } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && + !I.isFast()) { + DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); + Hints->setPotentiallyUnsafe(); + } + + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { + ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I) + << "value cannot be used outside the loop"); + return false; + } + } // next instr. + } + + if (!PrimaryInduction) { + DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + if (Inductions.empty()) { + ORE->emit(createMissedAnalysis("NoInductionVariable") + << "loop induction variable could not be identified"); + return false; + } + } + + // Now we know the widest induction type, check if our found induction + // is the same size. If it's not, unset it here and InnerLoopVectorizer + // will create another. + if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) + PrimaryInduction = nullptr; + + return true; +} + +bool LoopVectorizationLegality::canVectorizeMemory() { + LAI = &(*GetLAA)(*TheLoop); + const OptimizationRemarkAnalysis *LAR = LAI->getReport(); + if (LAR) { + ORE->emit([&]() { + return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(), + "loop not vectorized: ", *LAR); + }); + } + if (!LAI->canVectorizeMemory()) + return false; + + if (LAI->hasStoreToLoopInvariantAddress()) { + ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress") + << "write to a loop invariant address could not be vectorized"); + DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); + return false; + } + + Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); + PSE.addPredicate(LAI->getPSE().getUnionPredicate()); + + return true; +} + +bool LoopVectorizationLegality::isInductionPhi(const Value *V) { + Value *In0 = const_cast(V); + PHINode *PN = dyn_cast_or_null(In0); + if (!PN) + return false; + + return Inductions.count(PN); +} + +bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { + auto *Inst = dyn_cast(V); + return (Inst && InductionCastsToIgnore.count(Inst)); +} + +bool LoopVectorizationLegality::isInductionVariable(const Value *V) { + return isInductionPhi(V) || isCastedInductionVariable(V); +} + +bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { + return FirstOrderRecurrences.count(Phi); +} + +bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { + return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); +} + +bool LoopVectorizationLegality::blockCanBePredicated( + BasicBlock *BB, SmallPtrSetImpl &SafePtrs) { + const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + + for (Instruction &I : *BB) { + // Check that we don't have a constant expression that can trap as operand. + for (Value *Operand : I.operands()) { + if (auto *C = dyn_cast(Operand)) + if (C->canTrap()) + return false; + } + // We might be able to hoist the load. + if (I.mayReadFromMemory()) { + auto *LI = dyn_cast(&I); + if (!LI) + return false; + if (!SafePtrs.count(LI->getPointerOperand())) { + // !llvm.mem.parallel_loop_access implies if-conversion safety. + // Otherwise, record that the load needs (real or emulated) masking + // and let the cost model decide. + if (!IsAnnotatedParallel) + MaskedOp.insert(LI); + continue; + } + } + + if (I.mayWriteToMemory()) { + auto *SI = dyn_cast(&I); + if (!SI) + return false; + // Predicated store requires some form of masking: + // 1) masked store HW instruction, + // 2) emulation via load-blend-store (only if safe and legal to do so, + // be aware on the race conditions), or + // 3) element-by-element predicate check and scalar store. + MaskedOp.insert(SI); + continue; + } + if (I.mayThrow()) + return false; + } + + return true; +} + +bool LoopVectorizationLegality::canVectorizeWithIfConvert() { + if (!EnableIfConversion) { + ORE->emit(createMissedAnalysis("IfConversionDisabled") + << "if-conversion is disabled"); + return false; + } + + assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); + + // A list of pointers that we can safely read and write to. + SmallPtrSet SafePointes; + + // Collect safe addresses. + for (BasicBlock *BB : TheLoop->blocks()) { + if (blockNeedsPredication(BB)) + continue; + + for (Instruction &I : *BB) + if (auto *Ptr = getLoadStorePointerOperand(&I)) + SafePointes.insert(Ptr); + } + + // Collect the blocks that need predication. + BasicBlock *Header = TheLoop->getHeader(); + for (BasicBlock *BB : TheLoop->blocks()) { + // We don't support switch statements inside loops. + if (!isa(BB->getTerminator())) { + ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator()) + << "loop contains a switch statement"); + return false; + } + + // We must be able to predicate all blocks that need to be predicated. + if (blockNeedsPredication(BB)) { + if (!blockCanBePredicated(BB, SafePointes)) { + ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) + << "control flow cannot be substituted for a select"); + return false; + } + } else if (BB != Header && !canIfConvertPHINodes(BB)) { + ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) + << "control flow cannot be substituted for a select"); + return false; + } + } + + // We can if-convert this loop. + return true; +} + +// Helper function to canVectorizeLoopNestCFG. +bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, + bool UseVPlanNativePath) { + assert((UseVPlanNativePath || Lp->empty()) && + "VPlan-native path is not enabled."); + + // TODO: ORE should be improved to show more accurate information when an + // outer loop can't be vectorized because a nested loop is not understood or + // legal. Something like: "outer_loop_location: loop not vectorized: + // (inner_loop_location) loop control flow is not understood by vectorizer". + + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + + // We must have a loop in canonical form. Loops with indirectbr in them cannot + // be canonicalized. + if (!Lp->getLoopPreheader()) { + DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n"); + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // We must have a single backedge. + if (Lp->getNumBackEdges() != 1) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // We must have a single exiting block. + if (!Lp->getExitingBlock()) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // We only handle bottom-tested loops, i.e. loop in which the condition is + // checked at the end of each iteration. With that we can assume that all + // instructions in the loop are executed the same number of times. + if (Lp->getExitingBlock() != Lp->getLoopLatch()) { + ORE->emit(createMissedAnalysis("CFGNotUnderstood") + << "loop control flow is not understood by vectorizer"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + return Result; +} + +bool LoopVectorizationLegality::canVectorizeLoopNestCFG( + Loop *Lp, bool UseVPlanNativePath) { + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) { + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Recursively check whether the loop control flow of nested loops is + // understood. + for (Loop *SubLp : *Lp) + if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) { + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + return Result; +} + +bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; + + bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); + // Check whether the loop-related control flow in the loop nest is expected by + // vectorizer. + if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) { + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // We need to have a loop header. + DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() + << '\n'); + + // Specific checks for outer loops. We skip the remaining legal checks at this + // point because they don't support outer loops. + if (!TheLoop->empty()) { + assert(UseVPlanNativePath && "VPlan-native path is not enabled."); + + if (!canVectorizeOuterLoop()) { + DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n"); + // TODO: Implement DoExtraAnalysis when subsequent legal checks support + // outer loops. + return false; + } + + DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n"); + return Result; + } + + assert(TheLoop->empty() && "Inner loop expected."); + // Check if we can if-convert non-single-bb loops. + unsigned NumBlocks = TheLoop->getNumBlocks(); + if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { + DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Check if we can vectorize the instructions and CFG in this loop. + if (!canVectorizeInstrs()) { + DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Go over each instruction and look at memory deps. + if (!canVectorizeMemory()) { + DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + DEBUG(dbgs() << "LV: We can vectorize this loop" + << (LAI->getRuntimePointerChecking()->Need + ? " (with a runtime bound check)" + : "") + << "!\n"); + + unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; + if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) + SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; + + if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { + ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks") + << "Too many SCEV assumptions need to be made and checked " + << "at runtime"); + DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + + // Okay! We've done all the tests. If any have failed, return false. Otherwise + // we can vectorize, and at this point we don't have any other mem analysis + // which may limit our maximum vectorization factor, so just return true with + // no restrictions. + return Result; +} + +} // namespace llvm Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -131,6 +131,7 @@ #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include #include #include @@ -152,10 +153,6 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); -static cl::opt - EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, - cl::desc("Enable if-conversion during vectorization.")); - /// Loops with a known constant trip count below this number are vectorized only /// if no scalar iteration overheads are incurred. static cl::opt TinyTripCountVectorThreshold( @@ -191,9 +188,6 @@ "force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers.")); -/// Maximum vectorization interleave count. -static const unsigned MaxInterleaveFactor = 16; - static cl::opt ForceTargetMaxScalarInterleaveFactor( "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " @@ -245,57 +239,11 @@ cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); -static cl::opt PragmaVectorizeMemoryCheckThreshold( - "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, - cl::desc("The maximum allowed number of runtime memory checks with a " - "vectorize(enable) pragma.")); - -static cl::opt VectorizeSCEVCheckThreshold( - "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, - cl::desc("The maximum number of SCEV checks allowed.")); - -static cl::opt PragmaVectorizeSCEVCheckThreshold( - "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, - cl::desc("The maximum number of SCEV checks allowed with a " - "vectorize(enable) pragma")); - static cl::opt EnableVPlanNativePath( "enable-vplan-native-path", cl::init(false), cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); -/// Create an analysis remark that explains why vectorization failed -/// -/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p -/// RemarkName is the identifier for the remark. If \p I is passed it is an -/// instruction that prevents vectorization. Otherwise \p TheLoop is used for -/// the location of the remark. \return the remark object that can be -/// streamed to. -static OptimizationRemarkAnalysis -createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, - Instruction *I = nullptr) { - Value *CodeRegion = TheLoop->getHeader(); - DebugLoc DL = TheLoop->getStartLoc(); - - if (I) { - CodeRegion = I->getParent(); - // If there is no debug location attached to the instruction, revert back to - // using the loop's. - if (I->getDebugLoc()) - DL = I->getDebugLoc(); - } - - OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); - R << "loop not vectorized: "; - return R; -} - -namespace { - -class LoopVectorizationRequirements; - -} // end anonymous namespace - /// A helper function for converting Scalar types to vector types. /// If the incoming type is void, we return void. If the VF is 1, we return /// the scalar type. @@ -1177,315 +1125,6 @@ } }; -/// Utility class for getting and setting loop vectorizer hints in the form -/// of loop metadata. -/// This class keeps a number of loop annotations locally (as member variables) -/// and can, upon request, write them back as metadata on the loop. It will -/// initially scan the loop for existing metadata, and will update the local -/// values based on information in the loop. -/// We cannot write all values to metadata, as the mere presence of some info, -/// for example 'force', means a decision has been made. So, we need to be -/// careful NOT to add them if the user hasn't specifically asked so. -class LoopVectorizeHints { - enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED }; - - /// Hint - associates name and validation with the hint value. - struct Hint { - const char *Name; - unsigned Value; // This may have to change for non-numeric values. - HintKind Kind; - - Hint(const char *Name, unsigned Value, HintKind Kind) - : Name(Name), Value(Value), Kind(Kind) {} - - bool validate(unsigned Val) { - switch (Kind) { - case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; - case HK_UNROLL: - return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; - case HK_FORCE: - return (Val <= 1); - case HK_ISVECTORIZED: - return (Val==0 || Val==1); - } - return false; - } - }; - - /// Vectorization width. - Hint Width; - - /// Vectorization interleave factor. - Hint Interleave; - - /// Vectorization forced - Hint Force; - - /// Already Vectorized - Hint IsVectorized; - - /// Return the loop metadata prefix. - static StringRef Prefix() { return "llvm.loop."; } - - /// True if there is any unsafe math in the loop. - bool PotentiallyUnsafe = false; - -public: - enum ForceKind { - FK_Undefined = -1, ///< Not selected. - FK_Disabled = 0, ///< Forcing disabled. - FK_Enabled = 1, ///< Forcing enabled. - }; - - LoopVectorizeHints(const Loop *L, bool DisableInterleaving, - OptimizationRemarkEmitter &ORE) - : Width("vectorize.width", VectorizerParams::VectorizationFactor, - HK_WIDTH), - Interleave("interleave.count", DisableInterleaving, HK_UNROLL), - Force("vectorize.enable", FK_Undefined, HK_FORCE), - IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { - // Populate values with existing loop metadata. - getHintsFromMetadata(); - - // force-vector-interleave overrides DisableInterleaving. - if (VectorizerParams::isInterleaveForced()) - Interleave.Value = VectorizerParams::VectorizationInterleave; - - if (IsVectorized.Value != 1) - // If the vectorization width and interleaving count are both 1 then - // consider the loop to have been already vectorized because there's - // nothing more that we can do. - IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1; - DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() - << "LV: Interleaving disabled by the pass manager\n"); - } - - /// Mark the loop L as already vectorized by setting the width to 1. - void setAlreadyVectorized() { - IsVectorized.Value = 1; - Hint Hints[] = {IsVectorized}; - writeHintsToMetadata(Hints); - } - - bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { - if (getForce() == LoopVectorizeHints::FK_Disabled) { - DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); - emitRemarkWithHints(); - return false; - } - - if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { - DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); - emitRemarkWithHints(); - return false; - } - - if (getIsVectorized() == 1) { - DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); - // FIXME: Add interleave.disable metadata. This will allow - // vectorize.disable to be used without disabling the pass and errors - // to differentiate between disabled vectorization and a width of 1. - ORE.emit([&]() { - return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(), - "AllDisabled", L->getStartLoc(), - L->getHeader()) - << "loop not vectorized: vectorization and interleaving are " - "explicitly disabled, or the loop has already been " - "vectorized"; - }); - return false; - } - - return true; - } - - /// Dumps all the hint information. - void emitRemarkWithHints() const { - using namespace ore; - - ORE.emit([&]() { - if (Force.Value == LoopVectorizeHints::FK_Disabled) - return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "loop not vectorized: vectorization is explicitly disabled"; - else { - OptimizationRemarkMissed R(LV_NAME, "MissedDetails", - TheLoop->getStartLoc(), - TheLoop->getHeader()); - R << "loop not vectorized"; - if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=" << NV("Force", true); - if (Width.Value != 0) - R << ", Vector Width=" << NV("VectorWidth", Width.Value); - if (Interleave.Value != 0) - R << ", Interleave Count=" - << NV("InterleaveCount", Interleave.Value); - R << ")"; - } - return R; - } - }); - } - - unsigned getWidth() const { return Width.Value; } - unsigned getInterleave() const { return Interleave.Value; } - unsigned getIsVectorized() const { return IsVectorized.Value; } - enum ForceKind getForce() const { return (ForceKind)Force.Value; } - - /// \brief If hints are provided that force vectorization, use the AlwaysPrint - /// pass name to force the frontend to print the diagnostic. - const char *vectorizeAnalysisPassName() const { - if (getWidth() == 1) - return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Disabled) - return LV_NAME; - if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) - return LV_NAME; - return OptimizationRemarkAnalysis::AlwaysPrint; - } - - bool allowReordering() const { - // When enabling loop hints are provided we allow the vectorizer to change - // the order of operations that is given by the scalar loop. This is not - // enabled by default because can be unsafe or inefficient. For example, - // reordering floating-point operations will change the way round-off - // error accumulates in the loop. - return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; - } - - bool isPotentiallyUnsafe() const { - // Avoid FP vectorization if the target is unsure about proper support. - // This may be related to the SIMD unit in the target not handling - // IEEE 754 FP ops properly, or bad single-to-double promotions. - // Otherwise, a sequence of vectorized loops, even without reduction, - // could lead to different end results on the destination vectors. - return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe; - } - - void setPotentiallyUnsafe() { PotentiallyUnsafe = true; } - -private: - /// Find hints specified in the loop metadata and update local values. - void getHintsFromMetadata() { - MDNode *LoopID = TheLoop->getLoopID(); - if (!LoopID) - return; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = nullptr; - SmallVector Args; - - // The expected hint is either a MDString or a MDNode with the first - // operand a MDString. - if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { - if (!MD || MD->getNumOperands() == 0) - continue; - S = dyn_cast(MD->getOperand(0)); - for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) - Args.push_back(MD->getOperand(i)); - } else { - S = dyn_cast(LoopID->getOperand(i)); - assert(Args.size() == 0 && "too many arguments for MDString"); - } - - if (!S) - continue; - - // Check if the hint starts with the loop metadata prefix. - StringRef Name = S->getString(); - if (Args.size() == 1) - setHint(Name, Args[0]); - } - } - - /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) - return; - Name = Name.substr(Prefix().size(), StringRef::npos); - - const ConstantInt *C = mdconst::dyn_extract(Arg); - if (!C) - return; - unsigned Val = C->getZExtValue(); - - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; - for (auto H : Hints) { - if (Name == H->Name) { - if (H->validate(Val)) - H->Value = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); - break; - } - } - } - - /// Create a new hint from name / value pair. - MDNode *createHintMetadata(StringRef Name, unsigned V) const { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = {MDString::get(Context, Name), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); - } - - /// Matches metadata with hint name. - bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes) { - MDString *Name = dyn_cast(Node->getOperand(0)); - if (!Name) - return false; - - for (auto H : HintTypes) - if (Name->getString().endswith(H.Name)) - return true; - return false; - } - - /// Sets current hints into loop metadata, keeping other values intact. - void writeHintsToMetadata(ArrayRef HintTypes) { - if (HintTypes.empty()) - return; - - // Reserve the first element to LoopID (see below). - SmallVector MDs(1); - // If the loop already has metadata, then ignore the existing operands. - MDNode *LoopID = TheLoop->getLoopID(); - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - MDNode *Node = cast(LoopID->getOperand(i)); - // If node in update list, ignore old value. - if (!matchesHintMetadataName(Node, HintTypes)) - MDs.push_back(Node); - } - } - - // Now, add the missing hints. - for (auto H : HintTypes) - MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); - - // Replace current metadata node with new one. - LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - - TheLoop->setLoopID(NewLoopID); - } - - /// The loop these hints belong to. - const Loop *TheLoop; - - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter &ORE; -}; - } // end anonymous namespace static void emitMissedWarning(Function *F, Loop *L, @@ -1511,275 +1150,6 @@ namespace llvm { -/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and -/// to what vectorization factor. -/// This class does not look at the profitability of vectorization, only the -/// legality. This class has two main kinds of checks: -/// * Memory checks - The code in canVectorizeMemory checks if vectorization -/// will change the order of memory accesses in a way that will change the -/// correctness of the program. -/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory -/// checks for a number of different conditions, such as the availability of a -/// single induction variable, that all types are supported and vectorize-able, -/// etc. This code reflects the capabilities of InnerLoopVectorizer. -/// This class is also used by InnerLoopVectorizer for identifying -/// induction variable and the different reduction variables. -class LoopVectorizationLegality { -public: - LoopVectorizationLegality( - Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, - std::function *GetLAA, LoopInfo *LI, - OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, - LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC) - : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA), - ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {} - - /// ReductionList contains the reduction descriptors for all - /// of the reductions that were found in the loop. - using ReductionList = DenseMap; - - /// InductionList saves induction variables and maps them to the - /// induction descriptor. - using InductionList = MapVector; - - /// RecurrenceSet contains the phi nodes that are recurrences other than - /// inductions and reductions. - using RecurrenceSet = SmallPtrSet; - - /// Returns true if it is legal to vectorize this loop. - /// This does not mean that it is profitable to vectorize this - /// loop, only that it is legal to do so. - bool canVectorize(); - - /// Returns the primary induction variable. - PHINode *getPrimaryInduction() { return PrimaryInduction; } - - /// Returns the reduction variables found in the loop. - ReductionList *getReductionVars() { return &Reductions; } - - /// Returns the induction variables found in the loop. - InductionList *getInductionVars() { return &Inductions; } - - /// Return the first-order recurrences found in the loop. - RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; } - - /// Return the set of instructions to sink to handle first-order recurrences. - DenseMap &getSinkAfter() { return SinkAfter; } - - /// Returns the widest induction type. - Type *getWidestInductionType() { return WidestIndTy; } - - /// Returns True if V is a Phi node of an induction variable in this loop. - bool isInductionPhi(const Value *V); - - /// Returns True if V is a cast that is part of an induction def-use chain, - /// and had been proven to be redundant under a runtime guard (in other - /// words, the cast has the same SCEV expression as the induction phi). - bool isCastedInductionVariable(const Value *V); - - /// Returns True if V can be considered as an induction variable in this - /// loop. V can be the induction phi, or some redundant cast in the def-use - /// chain of the inducion phi. - bool isInductionVariable(const Value *V); - - /// Returns True if PN is a reduction variable in this loop. - bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } - - /// Returns True if Phi is a first-order recurrence in this loop. - bool isFirstOrderRecurrence(const PHINode *Phi); - - /// Return true if the block BB needs to be predicated in order for the loop - /// to be vectorized. - bool blockNeedsPredication(BasicBlock *BB); - - /// Check if this pointer is consecutive when vectorizing. This happens - /// when the last index of the GEP is the induction variable, or that the - /// pointer itself is an induction variable. - /// This check allows us to vectorize A[idx] into a wide load/store. - /// Returns: - /// 0 - Stride is unknown or non-consecutive. - /// 1 - Address is consecutive. - /// -1 - Address is consecutive, and decreasing. - /// NOTE: This method must only be used before modifying the original scalar - /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965). - int isConsecutivePtr(Value *Ptr); - - /// Returns true if the value V is uniform within the loop. - bool isUniform(Value *V); - - /// Returns the information that we collected about runtime memory check. - const RuntimePointerChecking *getRuntimePointerChecking() const { - return LAI->getRuntimePointerChecking(); - } - - const LoopAccessInfo *getLAI() const { return LAI; } - - unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } - - uint64_t getMaxSafeRegisterWidth() const { - return LAI->getDepChecker().getMaxSafeRegisterWidth(); - } - - bool hasStride(Value *V) { return LAI->hasStride(V); } - - /// Returns true if vector representation of the instruction \p I - /// requires mask. - bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); } - - unsigned getNumStores() const { return LAI->getNumStores(); } - unsigned getNumLoads() const { return LAI->getNumLoads(); } - - // Returns true if the NoNaN attribute is set on the function. - bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } - -private: - /// Return true if the pre-header, exiting and latch blocks of \p Lp and all - /// its nested loops are considered legal for vectorization. These legal - /// checks are common for inner and outer loop vectorization. - bool canVectorizeLoopNestCFG(Loop *Lp); - - /// Return true if the pre-header, exiting and latch blocks of \p Lp - /// (non-recursive) are considered legal for vectorization. - bool canVectorizeLoopCFG(Loop *Lp); - - /// Check if a single basic block loop is vectorizable. - /// At this point we know that this is a loop with a constant trip count - /// and we only need to check individual instructions. - bool canVectorizeInstrs(); - - /// When we vectorize loops we may change the order in which - /// we read and write from memory. This method checks if it is - /// legal to vectorize the code, considering only memory constrains. - /// Returns true if the loop is vectorizable - bool canVectorizeMemory(); - - /// Return true if we can vectorize this loop using the IF-conversion - /// transformation. - bool canVectorizeWithIfConvert(); - - /// Return true if we can vectorize this outer loop. The method performs - /// specific checks for outer loop vectorization. - bool canVectorizeOuterLoop(); - - /// Return true if all of the instructions in the block can be speculatively - /// executed. \p SafePtrs is a list of addresses that are known to be legal - /// and we know that we can read from them without segfault. - bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs); - - /// Updates the vectorization state by adding \p Phi to the inductions list. - /// This can set \p Phi as the main induction of the loop if \p Phi is a - /// better choice for the main induction than the existing one. - void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID, - SmallPtrSetImpl &AllowedExit); - - /// Create an analysis remark that explains why vectorization failed - /// - /// \p RemarkName is the identifier for the remark. If \p I is passed it is - /// an instruction that prevents vectorization. Otherwise the loop is used - /// for the location of the remark. \return the remark object that can be - /// streamed to. - OptimizationRemarkAnalysis - createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const { - return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), - RemarkName, TheLoop, I); - } - - /// \brief If an access has a symbolic strides, this maps the pointer value to - /// the stride symbol. - const ValueToValueMap *getSymbolicStrides() { - // FIXME: Currently, the set of symbolic strides is sometimes queried before - // it's collected. This happens from canVectorizeWithIfConvert, when the - // pointer is checked to reference consecutive elements suitable for a - // masked access. - return LAI ? &LAI->getSymbolicStrides() : nullptr; - } - - /// The loop that we evaluate. - Loop *TheLoop; - - /// Loop Info analysis. - LoopInfo *LI; - - /// A wrapper around ScalarEvolution used to add runtime SCEV checks. - /// Applies dynamic knowledge to simplify SCEV expressions in the context - /// of existing SCEV assumptions. The analysis will also add a minimal set - /// of new predicates if this is required to enable vectorization and - /// unrolling. - PredicatedScalarEvolution &PSE; - - /// Target Library Info. - TargetLibraryInfo *TLI; - - /// Dominator Tree. - DominatorTree *DT; - - // LoopAccess analysis. - std::function *GetLAA; - - // And the loop-accesses info corresponding to this loop. This pointer is - // null until canVectorizeMemory sets it up. - const LoopAccessInfo *LAI = nullptr; - - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter *ORE; - - // --- vectorization state --- // - - /// Holds the primary induction variable. This is the counter of the - /// loop. - PHINode *PrimaryInduction = nullptr; - - /// Holds the reduction variables. - ReductionList Reductions; - - /// Holds all of the induction variables that we found in the loop. - /// Notice that inductions don't need to start at zero and that induction - /// variables can be pointers. - InductionList Inductions; - - /// Holds all the casts that participate in the update chain of the induction - /// variables, and that have been proven to be redundant (possibly under a - /// runtime guard). These casts can be ignored when creating the vectorized - /// loop body. - SmallPtrSet InductionCastsToIgnore; - - /// Holds the phi nodes that are first-order recurrences. - RecurrenceSet FirstOrderRecurrences; - - /// Holds instructions that need to sink past other instructions to handle - /// first-order recurrences. - DenseMap SinkAfter; - - /// Holds the widest induction type encountered. - Type *WidestIndTy = nullptr; - - /// Allowed outside users. This holds the induction and reduction - /// vars which can be accessed from outside the loop. - SmallPtrSet AllowedExit; - - /// Can we assume the absence of NaNs. - bool HasFunNoNaNAttr = false; - - /// Vectorization requirements that will go through late-evaluation. - LoopVectorizationRequirements *Requirements; - - /// Used to emit an analysis of any legality issues. - LoopVectorizeHints *Hints; - - /// The demanded bits analsyis is used to compute the minimum type size in - /// which a reduction can be computed. - DemandedBits *DB; - - /// The assumption cache analysis is used to compute the minimum type size in - /// which a reduction can be computed. - AssumptionCache *AC; - - /// While vectorizing these instructions we have to generate a - /// call to the appropriate masked intrinsic - SmallPtrSet MaskedOp; -}; - /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -2117,7 +1487,7 @@ /// \p RemarkName is the identifier for the remark. \return the remark object /// that can be streamed to. OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { - return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(), + return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), RemarkName, TheLoop); } @@ -2232,78 +1602,6 @@ } // end namespace llvm -namespace { - -/// \brief This holds vectorization requirements that must be verified late in -/// the process. The requirements are set by legalize and costmodel. Once -/// vectorization has been determined to be possible and profitable the -/// requirements can be verified by looking for metadata or compiler options. -/// For example, some loops require FP commutativity which is only allowed if -/// vectorization is explicitly specified or if the fast-math compiler option -/// has been provided. -/// Late evaluation of these requirements allows helpful diagnostics to be -/// composed that tells the user what need to be done to vectorize the loop. For -/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late -/// evaluation should be used only when diagnostics can generated that can be -/// followed by a non-expert user. -class LoopVectorizationRequirements { -public: - LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {} - - void addUnsafeAlgebraInst(Instruction *I) { - // First unsafe algebra instruction. - if (!UnsafeAlgebraInst) - UnsafeAlgebraInst = I; - } - - void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } - - bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { - const char *PassName = Hints.vectorizeAnalysisPassName(); - bool Failed = false; - if (UnsafeAlgebraInst && !Hints.allowReordering()) { - ORE.emit([&]() { - return OptimizationRemarkAnalysisFPCommute( - PassName, "CantReorderFPOps", - UnsafeAlgebraInst->getDebugLoc(), - UnsafeAlgebraInst->getParent()) - << "loop not vectorized: cannot prove it is safe to reorder " - "floating-point operations"; - }); - Failed = true; - } - - // Test if runtime memcheck thresholds are exceeded. - bool PragmaThresholdReached = - NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; - bool ThresholdReached = - NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; - if ((ThresholdReached && !Hints.allowReordering()) || - PragmaThresholdReached) { - ORE.emit([&]() { - return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps", - L->getStartLoc(), - L->getHeader()) - << "loop not vectorized: cannot prove it is safe to reorder " - "memory operations"; - }); - DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - Failed = true; - } - - return Failed; - } - -private: - unsigned NumRuntimePointerChecks = 0; - Instruction *UnsafeAlgebraInst = nullptr; - - /// Interface to emit optimization remarks. - OptimizationRemarkEmitter &ORE; -}; - -} // end anonymous namespace - // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -2767,20 +2065,6 @@ } } -int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { - const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() : - ValueToValueMap(); - - int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); - if (Stride == 1 || Stride == -1) - return Stride; - return 0; -} - -bool LoopVectorizationLegality::isUniform(Value *V) { - return LAI->isUniform(V); -} - Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); @@ -4839,644 +4123,6 @@ assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } -/// \brief Check whether it is safe to if-convert this phi node. -/// -/// Phi nodes with constant expressions that can trap are not safe to if -/// convert. -static bool canIfConvertPHINodes(BasicBlock *BB) { - for (PHINode &Phi : BB->phis()) { - for (Value *V : Phi.incoming_values()) - if (auto *C = dyn_cast(V)) - if (C->canTrap()) - return false; - } - return true; -} - -bool LoopVectorizationLegality::canVectorizeWithIfConvert() { - if (!EnableIfConversion) { - ORE->emit(createMissedAnalysis("IfConversionDisabled") - << "if-conversion is disabled"); - return false; - } - - assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - - // A list of pointers that we can safely read and write to. - SmallPtrSet SafePointes; - - // Collect safe addresses. - for (BasicBlock *BB : TheLoop->blocks()) { - if (blockNeedsPredication(BB)) - continue; - - for (Instruction &I : *BB) - if (auto *Ptr = getLoadStorePointerOperand(&I)) - SafePointes.insert(Ptr); - } - - // Collect the blocks that need predication. - BasicBlock *Header = TheLoop->getHeader(); - for (BasicBlock *BB : TheLoop->blocks()) { - // We don't support switch statements inside loops. - if (!isa(BB->getTerminator())) { - ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator()) - << "loop contains a switch statement"); - return false; - } - - // We must be able to predicate all blocks that need to be predicated. - if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointes)) { - ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) - << "control flow cannot be substituted for a select"); - return false; - } - } else if (BB != Header && !canIfConvertPHINodes(BB)) { - ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator()) - << "control flow cannot be substituted for a select"); - return false; - } - } - - // We can if-convert this loop. - return true; -} - -// Helper function to canVectorizeLoopNestCFG. -bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp) { - assert((EnableVPlanNativePath || Lp->empty()) && - "VPlan-native path is not enabled."); - - // TODO: ORE should be improved to show more accurate information when an - // outer loop can't be vectorized because a nested loop is not understood or - // legal. Something like: "outer_loop_location: loop not vectorized: - // (inner_loop_location) loop control flow is not understood by vectorizer". - - // Store the result and return it at the end instead of exiting early, in case - // allowExtraAnalysis is used to report multiple reasons for not vectorizing. - bool Result = true; - bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); - - // We must have a loop in canonical form. Loops with indirectbr in them cannot - // be canonicalized. - if (!Lp->getLoopPreheader()) { - DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n"); - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We must have a single backedge. - if (Lp->getNumBackEdges() != 1) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We must have a single exiting block. - if (!Lp->getExitingBlock()) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We only handle bottom-tested loops, i.e. loop in which the condition is - // checked at the end of each iteration. With that we can assume that all - // instructions in the loop are executed the same number of times. - if (Lp->getExitingBlock() != Lp->getLoopLatch()) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - return Result; -} - -bool LoopVectorizationLegality::canVectorizeLoopNestCFG(Loop *Lp) { - // Store the result and return it at the end instead of exiting early, in case - // allowExtraAnalysis is used to report multiple reasons for not vectorizing. - bool Result = true; - bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); - if (!canVectorizeLoopCFG(Lp)) { - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Recursively check whether the loop control flow of nested loops is - // understood. - for (Loop *SubLp : *Lp) - if (!canVectorizeLoopNestCFG(SubLp)) { - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - return Result; -} - -bool LoopVectorizationLegality::canVectorize() { - // Store the result and return it at the end instead of exiting early, in case - // allowExtraAnalysis is used to report multiple reasons for not vectorizing. - bool Result = true; - - bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); - // Check whether the loop-related control flow in the loop nest is expected by - // vectorizer. - if (!canVectorizeLoopNestCFG(TheLoop)) { - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // We need to have a loop header. - DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName() - << '\n'); - - // Specific checks for outer loops. We skip the remaining legal checks at this - // point because they don't support outer loops. - if (!TheLoop->empty()) { - assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - - if (!canVectorizeOuterLoop()) { - DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n"); - // TODO: Implement DoExtraAnalysis when subsequent legal checks support - // outer loops. - return false; - } - - DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n"); - return Result; - } - - assert(TheLoop->empty() && "Inner loop expected."); - // Check if we can if-convert non-single-bb loops. - unsigned NumBlocks = TheLoop->getNumBlocks(); - if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { - DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Check if we can vectorize the instructions and CFG in this loop. - if (!canVectorizeInstrs()) { - DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Go over each instruction and look at memory deps. - if (!canVectorizeMemory()) { - DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - DEBUG(dbgs() << "LV: We can vectorize this loop" - << (LAI->getRuntimePointerChecking()->Need - ? " (with a runtime bound check)" - : "") - << "!\n"); - - unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; - if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) - SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; - - if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { - ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks") - << "Too many SCEV assumptions need to be made and checked " - << "at runtime"); - DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Okay! We've done all the tests. If any have failed, return false. Otherwise - // we can vectorize, and at this point we don't have any other mem analysis - // which may limit our maximum vectorization factor, so just return true with - // no restrictions. - return Result; -} - -// Return true if the inner loop \p Lp is uniform with regard to the outer loop -// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes -// executing the inner loop will execute the same iterations). This check is -// very constrained for now but it will be relaxed in the future. \p Lp is -// considered uniform if it meets all the following conditions: -// 1) it has a canonical IV (starting from 0 and with stride 1), -// 2) its latch terminator is a conditional branch and, -// 3) its latch condition is a compare instruction whose operands are the -// canonical IV and an OuterLp invariant. -// This check doesn't take into account the uniformity of other conditions not -// related to the loop latch because they don't affect the loop uniformity. -// -// NOTE: We decided to keep all these checks and its associated documentation -// together so that we can easily have a picture of the current supported loop -// nests. However, some of the current checks don't depend on \p OuterLp and -// would be redundantly executed for each \p Lp if we invoked this function for -// different candidate outer loops. This is not the case for now because we -// don't currently have the infrastructure to evaluate multiple candidate outer -// loops and \p OuterLp will be a fixed parameter while we only support explicit -// outer loop vectorization. It's also very likely that these checks go away -// before introducing the aforementioned infrastructure. However, if this is not -// the case, we should move the \p OuterLp independent checks to a separate -// function that is only executed once for each \p Lp. -static bool isUniformLoop(Loop *Lp, Loop *OuterLp) { - assert(Lp->getLoopLatch() && "Expected loop with a single latch."); - - // If Lp is the outer loop, it's uniform by definition. - if (Lp == OuterLp) - return true; - assert(OuterLp->contains(Lp) && "OuterLp must contain Lp."); - - // 1. - PHINode *IV = Lp->getCanonicalInductionVariable(); - if (!IV) { - DEBUG(dbgs() << "LV: Canonical IV not found.\n"); - return false; - } - - // 2. - BasicBlock *Latch = Lp->getLoopLatch(); - auto *LatchBr = dyn_cast(Latch->getTerminator()); - if (!LatchBr || LatchBr->isUnconditional()) { - DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n"); - return false; - } - - // 3. - auto *LatchCmp = dyn_cast(LatchBr->getCondition()); - if (!LatchCmp) { - DEBUG(dbgs() << "LV: Loop latch condition is not a compare instruction.\n"); - return false; - } - - Value *CondOp0 = LatchCmp->getOperand(0); - Value *CondOp1 = LatchCmp->getOperand(1); - Value *IVUpdate = IV->getIncomingValueForBlock(Latch); - if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) && - !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) { - DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n"); - return false; - } - - return true; -} - -// Return true if \p Lp and all its nested loops are uniform with regard to \p -// OuterLp. -static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) { - if (!isUniformLoop(Lp, OuterLp)) - return false; - - // Check if nested loops are uniform. - for (Loop *SubLp : *Lp) - if (!isUniformLoopNest(SubLp, OuterLp)) - return false; - - return true; -} - -bool LoopVectorizationLegality::canVectorizeOuterLoop() { - assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); - // Store the result and return it at the end instead of exiting early, in case - // allowExtraAnalysis is used to report multiple reasons for not vectorizing. - bool Result = true; - bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE); - - for (BasicBlock *BB : TheLoop->blocks()) { - // Check whether the BB terminator is a BranchInst. Any other terminator is - // not supported yet. - auto *Br = dyn_cast(BB->getTerminator()); - if (!Br) { - DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n"); - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - // Check whether the BranchInst is a supported one. Only unconditional - // branches, conditional branches with an outer loop invariant condition or - // backedges are supported. - if (Br && Br->isConditional() && - !TheLoop->isLoopInvariant(Br->getCondition()) && - !LI->isLoopHeader(Br->getSuccessor(0)) && - !LI->isLoopHeader(Br->getSuccessor(1))) { - DEBUG(dbgs() << "LV: Unsupported conditional branch.\n"); - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - } - - // Check whether inner loops are uniform. At this point, we only support - // simple outer loops scenarios with uniform nested loops. - if (!isUniformLoopNest(TheLoop /*loop nest*/, - TheLoop /*context outer loop*/)) { - DEBUG(dbgs() - << "LV: Not vectorizing: Outer loop contains divergent loops.\n"); - ORE->emit(createMissedAnalysis("CFGNotUnderstood") - << "loop control flow is not understood by vectorizer"); - if (DoExtraAnalysis) - Result = false; - else - return false; - } - - return Result; -} - -static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { - if (Ty->isPointerTy()) - return DL.getIntPtrType(Ty); - - // It is possible that char's or short's overflow when we ask for the loop's - // trip count, work around this by changing the type size. - if (Ty->getScalarSizeInBits() < 32) - return Type::getInt32Ty(Ty->getContext()); - - return Ty; -} - -static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { - Ty0 = convertPointerToIntegerType(DL, Ty0); - Ty1 = convertPointerToIntegerType(DL, Ty1); - if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) - return Ty0; - return Ty1; -} - -/// \brief Check that the instruction has outside loop users and is not an -/// identified reduction variable. -static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, - SmallPtrSetImpl &AllowedExit) { - // Reduction and Induction instructions are allowed to have exit users. All - // other instructions must not have external users. - if (!AllowedExit.count(Inst)) - // Check that all of the users of the loop are inside the BB. - for (User *U : Inst->users()) { - Instruction *UI = cast(U); - // This user may be a reduction exit value. - if (!TheLoop->contains(UI)) { - DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n'); - return true; - } - } - return false; -} - -void LoopVectorizationLegality::addInductionPhi( - PHINode *Phi, const InductionDescriptor &ID, - SmallPtrSetImpl &AllowedExit) { - Inductions[Phi] = ID; - - // In case this induction also comes with casts that we know we can ignore - // in the vectorized loop body, record them here. All casts could be recorded - // here for ignoring, but suffices to record only the first (as it is the - // only one that may bw used outside the cast sequence). - const SmallVectorImpl &Casts = ID.getCastInsts(); - if (!Casts.empty()) - InductionCastsToIgnore.insert(*Casts.begin()); - - Type *PhiTy = Phi->getType(); - const DataLayout &DL = Phi->getModule()->getDataLayout(); - - // Get the widest type. - if (!PhiTy->isFloatingPointTy()) { - if (!WidestIndTy) - WidestIndTy = convertPointerToIntegerType(DL, PhiTy); - else - WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); - } - - // Int inductions are special because we only allow one IV. - if (ID.getKind() == InductionDescriptor::IK_IntInduction && - ID.getConstIntStepValue() && - ID.getConstIntStepValue()->isOne() && - isa(ID.getStartValue()) && - cast(ID.getStartValue())->isNullValue()) { - - // Use the phi node with the widest type as induction. Use the last - // one if there are multiple (no good reason for doing this other - // than it is expedient). We've checked that it begins at zero and - // steps by one, so this is a canonical induction variable. - if (!PrimaryInduction || PhiTy == WidestIndTy) - PrimaryInduction = Phi; - } - - // Both the PHI node itself, and the "post-increment" value feeding - // back into the PHI node may have external users. - // We can allow those uses, except if the SCEVs we have for them rely - // on predicates that only hold within the loop, since allowing the exit - // currently means re-using this SCEV outside the loop. - if (PSE.getUnionPredicate().isAlwaysTrue()) { - AllowedExit.insert(Phi); - AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); - } - - DEBUG(dbgs() << "LV: Found an induction variable.\n"); -} - -bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *Header = TheLoop->getHeader(); - - // Look for the attribute signaling the absence of NaNs. - Function &F = *Header->getParent(); - HasFunNoNaNAttr = - F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; - - // For each block in the loop. - for (BasicBlock *BB : TheLoop->blocks()) { - // Scan the instructions in the block and look for hazards. - for (Instruction &I : *BB) { - if (auto *Phi = dyn_cast(&I)) { - Type *PhiTy = Phi->getType(); - // Check that this PHI type is allowed. - if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && - !PhiTy->isPointerTy()) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) - << "loop control flow is not understood by vectorizer"); - DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); - return false; - } - - // If this PHINode is not in the header block, then we know that we - // can convert it to select during if-conversion. No need to check if - // the PHIs in this block are induction or reduction variables. - if (BB != Header) { - // Check that this instruction has no outside users or is an - // identified reduction value with an outside user. - if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit)) - continue; - ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi) - << "value could not be identified as " - "an induction or reduction variable"); - return false; - } - - // We only allow if-converted PHIs with exactly two incoming values. - if (Phi->getNumIncomingValues() != 2) { - ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi) - << "control flow not understood by vectorizer"); - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } - - RecurrenceDescriptor RedDes; - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, - DT)) { - if (RedDes.hasUnsafeAlgebra()) - Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); - AllowedExit.insert(RedDes.getLoopExitInstr()); - Reductions[Phi] = RedDes; - continue; - } - - InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) { - addInductionPhi(Phi, ID, AllowedExit); - if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr) - Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst()); - continue; - } - - if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, - SinkAfter, DT)) { - FirstOrderRecurrences.insert(Phi); - continue; - } - - // As a last resort, coerce the PHI to a AddRec expression - // and re-try classifying it a an induction PHI. - if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) { - addInductionPhi(Phi, ID, AllowedExit); - continue; - } - - ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi) - << "value that could not be identified as " - "reduction is used outside the loop"); - DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n"); - return false; - } // end of PHI handling - - // We handle calls that: - // * Are debug info intrinsics. - // * Have a mapping to an IR intrinsic. - // * Have a vector version available. - auto *CI = dyn_cast(&I); - if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && - !isa(CI) && - !(CI->getCalledFunction() && TLI && - TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - ORE->emit(createMissedAnalysis("CantVectorizeCall", CI) - << "call instruction cannot be vectorized"); - DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); - return false; - } - - // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the - // second argument is the same (i.e. loop invariant) - if (CI && hasVectorInstrinsicScalarOpd( - getVectorIntrinsicIDForCall(CI, TLI), 1)) { - auto *SE = PSE.getSE(); - if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { - ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI) - << "intrinsic instruction cannot be vectorized"); - DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); - return false; - } - } - - // Check that the instruction return type is vectorizable. - // Also, we can't vectorize extractelement instructions. - if ((!VectorType::isValidElementType(I.getType()) && - !I.getType()->isVoidTy()) || - isa(I)) { - ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I) - << "instruction return type cannot be vectorized"); - DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); - return false; - } - - // Check that the stored type is vectorizable. - if (auto *ST = dyn_cast(&I)) { - Type *T = ST->getValueOperand()->getType(); - if (!VectorType::isValidElementType(T)) { - ORE->emit(createMissedAnalysis("CantVectorizeStore", ST) - << "store instruction cannot be vectorized"); - return false; - } - - // FP instructions can allow unsafe algebra, thus vectorizable by - // non-IEEE-754 compliant SIMD units. - // This applies to floating-point math operations and calls, not memory - // operations, shuffles, or casts, as they don't change precision or - // semantics. - } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) && - !I.isFast()) { - DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n"); - Hints->setPotentiallyUnsafe(); - } - - // Reduction instructions are allowed to have exit users. - // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) { - ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I) - << "value cannot be used outside the loop"); - return false; - } - } // next instr. - } - - if (!PrimaryInduction) { - DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); - if (Inductions.empty()) { - ORE->emit(createMissedAnalysis("NoInductionVariable") - << "loop induction variable could not be identified"); - return false; - } - } - - // Now we know the widest induction type, check if our found induction - // is the same size. If it's not, unset it here and InnerLoopVectorizer - // will create another. - if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType()) - PrimaryInduction = nullptr; - - return true; -} - void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -5882,102 +4528,6 @@ Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationLegality::canVectorizeMemory() { - LAI = &(*GetLAA)(*TheLoop); - const OptimizationRemarkAnalysis *LAR = LAI->getReport(); - if (LAR) { - ORE->emit([&]() { - return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(), - "loop not vectorized: ", *LAR); - }); - } - if (!LAI->canVectorizeMemory()) - return false; - - if (LAI->hasStoreToLoopInvariantAddress()) { - ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress") - << "write to a loop invariant address could not be vectorized"); - DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); - return false; - } - - Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); - PSE.addPredicate(LAI->getPSE().getUnionPredicate()); - - return true; -} - -bool LoopVectorizationLegality::isInductionPhi(const Value *V) { - Value *In0 = const_cast(V); - PHINode *PN = dyn_cast_or_null(In0); - if (!PN) - return false; - - return Inductions.count(PN); -} - -bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { - auto *Inst = dyn_cast(V); - return (Inst && InductionCastsToIgnore.count(Inst)); -} - -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { - return isInductionPhi(V) || isCastedInductionVariable(V); -} - -bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { - return FirstOrderRecurrences.count(Phi); -} - -bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { - return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); -} - -bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl &SafePtrs) { - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); - - for (Instruction &I : *BB) { - // Check that we don't have a constant expression that can trap as operand. - for (Value *Operand : I.operands()) { - if (auto *C = dyn_cast(Operand)) - if (C->canTrap()) - return false; - } - // We might be able to hoist the load. - if (I.mayReadFromMemory()) { - auto *LI = dyn_cast(&I); - if (!LI) - return false; - if (!SafePtrs.count(LI->getPointerOperand())) { - // !llvm.mem.parallel_loop_access implies if-conversion safety. - // Otherwise, record that the load needs (real or emulated) masking - // and let the cost model decide. - if (!IsAnnotatedParallel) - MaskedOp.insert(LI); - continue; - } - } - - if (I.mayWriteToMemory()) { - auto *SI = dyn_cast(&I); - if (!SI) - return false; - // Predicated store requires some form of masking: - // 1) masked store HW instruction, - // 2) emulation via load-blend-store (only if safe and legal to do so, - // be aware on the race conditions), or - // 3) element-by-element predicate check and scalar store. - MaskedOp.insert(SI); - continue; - } - if (I.mayThrow()) - return false; - } - - return true; -} - void InterleavedAccessInfo::collectConstStrideAccesses( MapVector &AccessStrideInfo, const ValueToValueMap &Strides) { @@ -8680,7 +7230,7 @@ LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE, &Requirements, &Hints, DB, AC); - if (!LVL.canVectorize()) { + if (!LVL.canVectorize(EnableVPlanNativePath)) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints, ORE); return false; @@ -8752,8 +7302,8 @@ if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"); - ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NoImplicitFloat", L) + ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), + "NoImplicitFloat", L) << "loop not vectorized due to NoImplicitFloat attribute"); emitMissedWarning(F, L, Hints, ORE); return false; @@ -8767,7 +7317,7 @@ TTI->isFPVectorizationPotentiallyUnsafe()) { DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); ORE->emit( - createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) + createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) << "loop not vectorized due to unsafe FP support."); emitMissedWarning(F, L, Hints, ORE); return false;