Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -528,6 +528,18 @@
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace) const;
+  /// \return The cost of the strided memory operation.
+  /// \p Opcode is the memory operation code
+  /// \p VecTy is the vector type of the interleaved access.
+  /// \p Factor is the interleave factor
+  /// \p Indices is the indices for interleaved load members (as interleaved
+  /// load allows gaps)
+  /// \p Alignment is the alignment of the memory operation
+  /// \p AddressSpace is address space of the pointer.
+  int getStridedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+                             ArrayRef<unsigned> Indices, unsigned Alignment,
+                             unsigned AddressSpace) const;
+
   /// \brief Calculate the cost of performing a vector reduction.
   ///
   /// This is the cost of reducing the vector value of type \p Ty to a scalar
@@ -692,6 +704,11 @@
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
                                            unsigned AddressSpace) = 0;
+  virtual int getStridedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                     unsigned Factor,
+                                     ArrayRef<unsigned> Indices,
+                                     unsigned Alignment,
+                                     unsigned AddressSpace) = 0;
   virtual int getReductionCost(unsigned Opcode, Type *Ty,
                                bool IsPairwiseForm) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
@@ -900,6 +917,12 @@
     return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
   }
+  int getStridedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+                             ArrayRef<unsigned> Indices, unsigned Alignment,
+                             unsigned AddressSpace) override {
+    return Impl.getStridedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                       Alignment, AddressSpace);
+  }
   int getReductionCost(unsigned Opcode, Type *Ty,
                        bool IsPairwiseForm) override {
     return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -332,6 +332,15 @@
     return 1;
   }
+  unsigned getStridedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                  unsigned Factor,
+                                  ArrayRef<unsigned> Indices,
+                                  unsigned Alignment,
+                                  unsigned AddressSpace) {
+    return getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                      Alignment, AddressSpace);
+  }
+
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                  ArrayRef<Type *> Tys, FastMathFlags FMF) {
     return 1;
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -595,6 +595,15 @@
     return Cost;
   }
+  unsigned getStridedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                  unsigned Factor,
+                                  ArrayRef<unsigned> Indices,
+                                  unsigned Alignment,
+                                  unsigned AddressSpace) {
+    return getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                      Alignment, AddressSpace);
+  }
+
   /// Get intrinsic cost based on arguments
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                  ArrayRef<Value *> Args, FastMathFlags FMF) {
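As a stand-alone illustration (class names and cost numbers below are made up, not taken from this patch), the three header changes above all follow the same delegation shape: a concrete TTI implementation may override getStridedMemoryOpCost, and the default implementations simply forward to getInterleavedMemoryOpCost, so only targets that override the hook price strided accesses differently.

#include <cstdio>

// Base cost model: by default a strided access is priced exactly like an
// interleaved access, mirroring the forwarding in TargetTransformInfoImpl.h
// and BasicTTIImpl.h above.
struct DefaultCostModel {
  virtual ~DefaultCostModel() = default;
  virtual int getInterleavedCost(int Factor) { return 4 * Factor; }
  virtual int getStridedCost(int Factor) { return getInterleavedCost(Factor); }
};

// A target that can lower strided accesses more cheaply overrides only the
// strided hook, as the X86 implementation does further down in this patch.
struct TargetCostModel : DefaultCostModel {
  int getStridedCost(int Factor) override { return 2 * Factor; }
};

int main() {
  TargetCostModel TTI;
  std::printf("interleaved=%d strided=%d\n", TTI.getInterleavedCost(3),
              TTI.getStridedCost(3));
  return 0;
}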
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -331,6 +331,15 @@
   return Cost;
 }
+int TargetTransformInfo::getStridedMemoryOpCost(
+    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+    unsigned Alignment, unsigned AddressSpace) const {
+  int Cost = TTIImpl->getStridedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                                ArrayRef<Type *> Tys,
                                                FastMathFlags FMF) const {
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -682,6 +682,9 @@
     unsigned getByValTypeAlignment(Type *Ty,
                                    const DataLayout &DL) const override;
+    /// Returns the maximum supported interleave factor.
+    unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+
     /// Returns the target specific optimal type for load
     /// and store operations as a result of memset, memcpy, and memmove
     /// lowering. If DstAlign is zero that means it's safe to destination
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -76,6 +76,9 @@
                                  unsigned AddressSpace);
   int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                             unsigned AddressSpace);
+  int getStridedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+                             ArrayRef<unsigned> Indices, unsigned Alignment,
+                             unsigned AddressSpace);
   int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                              bool VariableMask, unsigned Alignment);
   int getAddressComputationCost(Type *PtrTy, bool IsComplex);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -26,6 +26,51 @@
 #define DEBUG_TYPE "x86tti"
 
+/// Returns the number of valid memory elements in one VF-wide access.
+/// I.e. for Stride 2 and VF 4:
+///     |---|---|---|---|
+///     | a | * | b | * |
+///     |---|---|---|---|
+///       0   1   2   3
+/// Only offsets 0 & 2, holding the values a & b, are memory accesses of
+/// interest, so it will return 2.
+/// NOTE: It assumes all iterations of a given stride hold a common memory
+/// access pattern.
+static unsigned getValidMemInSingleVFAccess(unsigned VF, unsigned Stride) {
+  return ((VF - 1) / Stride) + 1;
+}
+
+/// Returns the number of loads/stores required to fill a vector register.
+/// I.e. for Stride 3 and VF 4:
+///     |---|---|---|---|---|---|---|---|
+///     | a | * | * | b | c | * | * | d |
+///     |---|---|---|---|---|---|---|---|
+///       0   1   2   3   6   7   8   9
+/// Offsets 0, 3, 6, 9 are required to fill the vector register,
+/// so 2 vector loads will be required.
+/// NOTE: It assumes all iterations of a given stride hold a common memory
+/// access pattern.
+static unsigned getRequiredLoadStore(unsigned VF, unsigned Stride) {
+  unsigned VMem = getValidMemInSingleVFAccess(VF, Stride);
+  return (VF / VMem) + ((VF % VMem) ? 1 : 0);
+}
+
+/// Returns the number of shuffles required for promotion.
+/// Promoting a shuffle to a wider type may require generating additional
+/// shuffles.
+static unsigned getShuffleRequiredForPromotion(unsigned VF, unsigned Stride) {
+  unsigned ValidElements = getValidMemInSingleVFAccess(VF, Stride);
+  unsigned FixUpStride = 0;
+  // Identify the valid elements in the first two loads.
+  unsigned Prev = 2 * ValidElements;
+  // Iterate while the number of collected elements is less than VF.
+  while (Prev < VF) {
+    FixUpStride++;
+    Prev = Prev + ValidElements;
+  }
+  return FixUpStride;
+}
+
 //===----------------------------------------------------------------------===//
 //
 // X86 cost model.
@@ -1100,6 +1145,53 @@
   return Cost+LT.first;
 }
 
+/// Returns the vectorization cost of a strided memory load/store.
+/// In addition to the load/store instructions it also considers the cost of
+/// the required shuffles.
+int X86TTIImpl::getStridedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                       unsigned Factor,
+                                       ArrayRef<unsigned> Indices,
+                                       unsigned Alignment,
+                                       unsigned AddressSpace) {
+  assert(Factor >= 2 && "Invalid interleave factor");
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+    // The input is the wide vector type (i.e. VectorTy * Stride).
+    // First get the actual vector type.
+    Type *SubVecTy = VectorType::get(VecTy->getScalarType(),
+                                     (VecTy->getVectorNumElements() / Factor));
+    // Identify the vector factor and the number of loads/stores required.
+    unsigned VF = SubVecTy->getVectorNumElements();
+    unsigned MemOpRequired = getRequiredLoadStore(VF, Factor);
+    // First, the cost of a single load/store operation.
+    unsigned Cost = getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace);
+    // Multiply Cost by the number of load/store operations.
+    Cost = Cost * MemOpRequired;
+    unsigned ShuffleCost = 0;
+    // Identify the number of shuffles required for promotion.
+    // When the types differ we can't directly shuffle the elements, because
+    // shufflevector expects both operands to have the same type,
+    // i.e. <4 x i32> * <2 x i32> is not possible, so it is required
+    // to promote <2 x i32> to <4 x i32> with an undef vector.
+    unsigned TotalShufflePerMemOp = getShuffleRequiredForPromotion(VF, Factor);
+    // Add the cost of the shuffles required for promotion.
+    // NOTE: This is only applicable for loads.
+    if (Opcode == Instruction::Load)
+      for (unsigned i = 0; i < MemOpRequired; i++)
+        for (unsigned j = 0; j < TotalShufflePerMemOp; j++)
+          ShuffleCost +=
+              getShuffleCost(TTI::SK_InsertSubvector, VecTy, 0, SubVecTy);
+    // Add the cost of the shuffles required to combine the multiple memory
+    // operations.
+    for (unsigned i = 0; i < (MemOpRequired - 1); i++)
+      ShuffleCost +=
+          getShuffleCost(TTI::SK_InsertSubvector, SubVecTy, 0, SubVecTy);
+    // Update the final cost & return.
+    Cost += ShuffleCost;
+    return Cost;
+  }
+  // Otherwise fall back to the base getInterleavedMemoryOpCost.
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
+
 int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
   // Address computations in vectorized code with non-consecutive addresses will
   // likely result in more instructions compared to scalar code where the
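The helper arithmetic above is easiest to see with concrete numbers. The following stand-alone sketch is not part of the patch; it merely mirrors getValidMemInSingleVFAccess, getRequiredLoadStore and getShuffleRequiredForPromotion, and the VF/stride values are examples only.

#include <cstdio>

// Mirrors getValidMemInSingleVFAccess(): elements of interest in one access.
static unsigned validMemInSingleVFAccess(unsigned VF, unsigned Stride) {
  return ((VF - 1) / Stride) + 1;
}

// Mirrors getRequiredLoadStore(): vector loads/stores needed to gather VF
// elements of interest.
static unsigned requiredLoadStore(unsigned VF, unsigned Stride) {
  unsigned VMem = validMemInSingleVFAccess(VF, Stride);
  return (VF / VMem) + ((VF % VMem) ? 1 : 0);
}

// Mirrors getShuffleRequiredForPromotion().
static unsigned shuffleRequiredForPromotion(unsigned VF, unsigned Stride) {
  unsigned ValidElements = validMemInSingleVFAccess(VF, Stride);
  unsigned FixUpStride = 0;
  for (unsigned Prev = 2 * ValidElements; Prev < VF; Prev += ValidElements)
    ++FixUpStride;
  return FixUpStride;
}

int main() {
  // VF = 4, Stride = 3: 2 useful elements per access, 2 loads to fill a
  // register, and no promotion shuffles (2 * 2 >= 4).  Prints "2 2 0".
  std::printf("%u %u %u\n", validMemInSingleVFAccess(4, 3),
              requiredLoadStore(4, 3), shuffleRequiredForPromotion(4, 3));
  // VF = 8, Stride = 3: 3 useful elements per access, 3 loads, and one extra
  // promotion shuffle (3, then 6, then 9 collected elements).  Prints "3 3 1".
  std::printf("%u %u %u\n", validMemInSingleVFAccess(8, 3),
              requiredLoadStore(8, 3), shuffleRequiredForPromotion(8, 3));
  return 0;
}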
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -149,6 +149,17 @@
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 
+/// Option to enable strided vectorization.
+static cl::opt<bool> EnableStridedVectorization(
+    "enable-strided-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable strided memory accesses vectorization in a loop"));
+
+/// Option to enable masked load generation; it only takes effect together
+/// with strided vectorization.
+static cl::opt<bool> EnableStridedMaskLoad(
+    "enable-strided-mask-load", cl::init(false), cl::Hidden,
+    cl::desc("Enable strided memory vectorization with masked load generation"));
+
 /// Maximum factor for an interleaved memory access.
 static cl::opt<unsigned> MaxInterleaveGroupFactor(
     "max-interleave-group-factor", cl::Hidden,
@@ -293,6 +304,47 @@
   return nullptr;
 }
 
+/// Returns the SkipFactor.
+/// The SkipFactor is the additional number of offsets by which to move the
+/// index pointer after a vector load/store. The index pointer is the start
+/// address of memory in a particular loop iteration; from this address we do
+/// a vector load/store. The index pointer movement is required for subsequent
+/// memory accesses in the loop. By introducing the SkipFactor we ensure that
+/// all vector memory accesses hold the same memory pattern, which is why a
+/// common mask can be used for accesses across iterations.
+static unsigned getSkipFactor(unsigned VF, unsigned Stride) {
+  return (Stride - (VF % Stride)) % Stride;
+}
+
+/// Returns the number of valid memory elements in one VF-wide access.
+/// I.e. for Stride 2 and VF 4:
+///     |---|---|---|---|
+///     | a | * | b | * |
+///     |---|---|---|---|
+///       0   1   2   3
+/// Only offsets 0 & 2, holding the values a & b, are memory accesses of
+/// interest, so it will return 2.
+/// NOTE: It assumes all iterations of a given stride hold a common memory
+/// access pattern.
+static unsigned getValidMemInSingleVFAccess(unsigned VF, unsigned Stride) {
+  return ((VF - 1) / Stride) + 1;
+}
+
+/// Returns the number of loads/stores required to fill a vector register.
+/// I.e. for Stride 3 and VF 4:
+///     |---|---|---|---|---|---|---|---|
+///     | a | * | * | b | c | * | * | d |
+///     |---|---|---|---|---|---|---|---|
+///       0   1   2   3   6   7   8   9
+/// Offsets 0, 3, 6, 9 are required to fill the vector register,
+/// so 2 vector loads will be required.
+/// NOTE: It assumes all iterations of a given stride hold a common memory
+/// access pattern.
+static unsigned getRequiredLoadStore(unsigned VF, unsigned Stride) {
+  unsigned VMem = getValidMemInSingleVFAccess(VF, Stride);
+  return (VF / VMem) + ((VF % VMem) ? 1 : 0);
+}
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -440,6 +492,15 @@
   /// Try to vectorize the interleaved access group that \p Instr belongs to.
   void vectorizeInterleaveGroup(Instruction *Instr);
 
+  /// Try to vectorize the strided memory access.
+  void vectorizeStridedMemoryAccess(Instruction *Instr);
+
+  /// Vectorize the strided load.
+  void vectorizeStridedLoad(Instruction *Instr);
+
+  /// Vectorize the strided store.
+  void vectorizeStridedStore(Instruction *Instr);
+
   /// Generate a shuffle sequence that will reverse the vector Vec.
   virtual Value *reverseVector(Value *Vec);
 
@@ -459,7 +520,17 @@
   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
   /// Emit bypass checks to check any memory assumptions we may have made.
   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
-
+  /// Returns the mask required for a masked load/store instruction.
+  Constant *getLoadStoreInstMask(IRBuilder<> &Builder, unsigned VF,
+                                 unsigned Stride, unsigned NumVec);
+  /// Returns the mask required for the shuffle used by a strided store.
+  Constant *getStoreShuffleVectorMask(IRBuilder<> &Builder, unsigned VF,
+                                      unsigned Stride, unsigned Part);
+  /// Returns the mask required for the shuffle used by a strided load.
+  Constant *getLoadShuffleVectorMask(IRBuilder<> &Builder, unsigned Start,
+                                     unsigned Stride, unsigned VF,
+                                     bool isFirstOpShuffle, unsigned FirstOpLen,
+                                     unsigned SecondOpLen);
   /// Add additional metadata to \p To that was not present on \p Orig.
   ///
   /// Currently this is used to add the noalias annotations based on the
@@ -830,6 +901,142 @@
   Instruction *InsertPos;
 };
 
+/// \brief The descriptor for a strided memory access.
+struct StrideDescriptor {
+  StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, unsigned Align)
+      : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
+
+  StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}
+
+  int Stride;       // The access's stride. It is negative for a reverse access.
+  const SCEV *Scev; // The scalar expression of this access.
+  unsigned Size;    // The size of the memory object.
+  unsigned Align;   // The alignment of this access.
+};
+
+class StrideAccessInfo {
+public:
+  const Instruction *MemAccessInstr;
+  StrideDescriptor &SD;
+  bool hasValidStride;
+  ScalarEvolution *SE;
+  Loop *CurLoop;
+  LoopInfo *LInfo;
+  SmallVector<const Instruction *, 4> DependentInstr;
+
+  StrideAccessInfo(const Instruction *MemAccessInstr, StrideDescriptor &SD,
+                   ScalarEvolution *SE, Loop *CurLoop, LoopInfo *LInfo)
+      : MemAccessInstr(MemAccessInstr), SD(SD), hasValidStride(false), SE(SE),
+        CurLoop(CurLoop), LInfo(LInfo) {
+    analyzeStride(MemAccessInstr);
+  }
+
+  void insertDependentInstr(const Instruction *I) {
+    DependentInstr.push_back(I);
+  }
+
+  SmallVector<const Instruction *, 4> &getDependentInstr(const Instruction *I) {
+    return DependentInstr;
+  }
+
+  bool isValidStrideOperation() const;
+  bool validateStrideOperation(Instruction *I);
+  void analyzeStride(const Instruction *);
+
+  void print() const {
+    dbgs() << "Memory Instruction : ";
+    MemAccessInstr->dump();
+    dbgs() << "Has dependent Instr \n";
+    SmallVector<const Instruction *, 4>::const_iterator I, E;
+    for (I = DependentInstr.begin(), E = DependentInstr.end(); I != E; ++I) {
+      dbgs() << "  ";
+      (*I)->dump();
+      dbgs() << "\n";
+    }
+  }
+};
+
+/// Validates whether an instruction performs a valid strided operation.
+/// In the index computation it only allows Add, Sub, Shl & Mul operations;
+/// if an index has a non-constant stride it returns false.
+/// In case of a PHI it must be an induction PHI.
+bool StrideAccessInfo::validateStrideOperation(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Shl:
+  case Instruction::Mul: {
+    // The second operand should be a compile-time constant.
+    ConstantInt *CV = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (!CV)
+      return false;
+    // The first operand should be a valid instruction.
+    Instruction *Op0Inst = dyn_cast<Instruction>(I->getOperand(0));
+    if (!Op0Inst)
+      return false;
+    // Analyze the first operand further.
+    if (!validateStrideOperation(Op0Inst)) {
+      return false;
+    }
+    // Consider the instruction as dependent & return true.
+    insertDependentInstr(I);
+    return true;
+  }
+  case Instruction::PHI: {
+    // The PHI should be an induction PHI, else return false.
+    InductionDescriptor ID;
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) {
+      return true;
+    }
+    return false;
+  }
+  default:
+    return false;
+  }
+  return false;
+}
+
+/// Analyze the stride operation in an instruction and mark it as a valid
+/// stride access; it considers the operations involved in the stride
+/// calculation and only handles simple stride patterns, i.e.
+///   arr[i*X], arr[(i+1) * X], arr[(i*X)+1]   (where X is a constant stride)
+void StrideAccessInfo::analyzeStride(const Instruction *I) {
+  const LoadInst *Ld = dyn_cast<LoadInst>(I);
+  const StoreInst *St = dyn_cast<StoreInst>(I);
+  assert((Ld || St) && "Invalid instruction, not load or store");
+  const Value *Ptr = Ld ? Ld->getPointerOperand() : St->getPointerOperand();
+
+  const GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!Gep)
+    return;
+  unsigned InductionOperand = getGEPInductionOperand(Gep);
+  const SCEV *IndGep = SE->getSCEV(Gep->getOperand(InductionOperand));
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IndGep);
+  if (!AR || !AR->isAffine())
+    return;
+  for (unsigned int i = 0; i < Gep->getNumOperands(); i++) {
+    Instruction *UI = dyn_cast<Instruction>(Gep->getOperand(i));
+    if (!UI)
+      continue;
+    // Consider only instructions inside the current loop.
+    if (LInfo->getLoopFor(UI->getParent()) != CurLoop)
+      continue;
+    // Return when an invalid stride operation is found.
+    if (!validateStrideOperation(UI))
+      return;
+  }
+  hasValidStride = true;
+  DEBUG(dbgs() << "LV: Identified valid stride operation for strided"
+               << " memory vectorization, from instruction:\n"
+               << *I << "\n");
+  return;
+}
+
+/// For a strided memory access, returns true if it holds a valid stride
+/// operation, else returns false.
+bool StrideAccessInfo::isValidStrideOperation() const { return hasValidStride; }
+
 /// \brief Drive the analysis of interleaved memory accesses in the loop.
 ///
 /// Use this class to analyze interleaved accesses only when we can vectorize
@@ -855,7 +1062,7 @@
   /// \brief Analyze the interleaved accesses and collect them in interleave
   /// groups. Substitute symbolic strides using \p Strides.
-  void analyzeInterleaving(const ValueToValueMap &Strides);
+  void analyzeInterleaving(const ValueToValueMap &Strides, LoopInfo *LInfo);
 
   /// \brief Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
@@ -879,6 +1086,19 @@
     return nullptr;
   }
 
+  /// For a strided memory access, returns true if it holds a valid stride
+  /// operation, else returns false.
+  bool isValidStrideOperation(Instruction *I) {
+    MapVector<Instruction *, StrideAccessInfo>::iterator It =
+        StrideInfo.find(I);
+    if (It != StrideInfo.end())
+      return It->second.isValidStrideOperation();
+    return false;
+  }
+
+  /// Holds each instruction & its stride access information.
+  MapVector<Instruction *, StrideAccessInfo> StrideInfo;
+
   /// \brief Returns true if an interleaved group that may access memory
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
@@ -900,20 +1120,6 @@
   /// Holds the relationships between the members and the interleave group.
   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
 
-  /// \brief The descriptor for a strided memory access.
-  struct StrideDescriptor {
-    StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
-                     unsigned Align)
-        : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
-
-    StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}
-
-    int Stride; // The access's stride. It is negative for a reverse access.
-    const SCEV *Scev; // The scalar expression of this access
-    unsigned Size; // The size of the memory object.
-    unsigned Align; // The alignment of this access.
-  };
-
   /// \brief Create a new interleave group with the given instruction \p Instr,
   /// stride \p Stride and alignment \p Align.
   ///
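For reference, these are the kinds of source-level access patterns the StrideAccessInfo analysis above is intended to accept or reject. This snippet is illustrative only; the function and array names are not taken from the patch or its tests.

// Accepted: the GEP index is built only from Add/Sub/Shl/Mul with a
// compile-time-constant second operand, rooted at an induction PHI.
void acceptedPatterns(int *A, int *B, int *C, int N) {
  for (int i = 0; i < N; ++i) {
    C[i] = A[3 * i];         // arr[i * X] with a constant stride X
    B[i] = A[3 * i + 1];     // arr[(i * X) + 1]
    C[i] += A[(i + 1) * 3];  // arr[(i + 1) * X]
  }
}

// Rejected: the stride K is not a compile-time constant, so
// validateStrideOperation() returns false for the index computation.
void rejectedPattern(int *A, int *C, int N, int K) {
  for (int i = 0; i < N; ++i)
    C[i] = A[K * i];
}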
@@ -938,7 +1144,7 @@
   /// \brief Collect all the accesses with a constant stride in program order.
   void collectConstStridedAccesses(
       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
-      const ValueToValueMap &Strides);
+      const ValueToValueMap &Strides, LoopInfo *LInfo);
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1271,11 +1477,11 @@
                              const TargetTransformInfo *TTI,
                              LoopAccessAnalysis *LAA,
                              LoopVectorizationRequirements *R,
-                             LoopVectorizeHints *H)
+                             LoopVectorizeHints *H, LoopInfo *LInfo)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
         TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
         Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
-        Requirements(R), Hints(H) {}
+        Requirements(R), Hints(H), LInfo(LInfo) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1403,6 +1609,15 @@
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
   unsigned getNumPredStores() const { return NumPredStores; }
 
+  /// \brief Get the collected strided access information.
+  MapVector<Instruction *, StrideAccessInfo> &getStrideAccessGroup() {
+    return InterleaveInfo.StrideInfo;
+  }
+  /// For a strided memory access, returns true if it holds a valid stride
+  /// operation, else returns false.
+  bool isValidStrideOperation(Instruction *I) {
+    return InterleaveInfo.isValidStrideOperation(I);
+  }
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -1515,6 +1730,7 @@
   /// While vectorizing these instructions we have to generate a
   /// call to the appropriate masked intrinsic
   SmallPtrSet<Instruction *, 8> MaskedOp;
+  LoopInfo *LInfo;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1888,7 +2104,7 @@
     // Check if it is legal to vectorize the loop.
     LoopVectorizationRequirements Requirements;
     LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
-                                  &Requirements, &Hints);
+                                  &Requirements, &Hints, LI);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
       emitMissedWarning(F, L, Hints);
@@ -2289,6 +2505,88 @@
                                      "reverse");
 }
 
+/// Based on the number of valid elements it generates the mask for a masked
+/// load or store.
+/// I.e. if the stride is 2 and VF is 4, the number of valid elements is 2;
+/// in this case it will generate
+/// Mask: <i1 1, i1 0, i1 1, i1 0>
+/// If the number of valid elements is 1 (in some cases) then it will generate
+/// Mask: <i1 1, i1 0, i1 0, i1 0>
+/// Here the number of valid elements (NumVec) plays the vital role: once the
+/// valid elements are used up, the remaining lanes get 0 as their mask value.
+Constant *InnerLoopVectorizer::getLoadStoreInstMask(IRBuilder<> &Builder,
+                                                    unsigned VF,
+                                                    unsigned Stride,
+                                                    unsigned NumVec) {
+  SmallVector<Constant *, 8> Mask;
+  for (unsigned i = 0; i < VF; i++) {
+    bool M = NumVec && ((i % Stride) ? 0 : 1);
+    Mask.push_back(Builder.getInt1(M));
+    if (M)
+      NumVec = NumVec - 1;
+  }
+  return ConstantVector::get(Mask);
+}
+
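A quick stand-alone check of the skip-factor and mask arithmetic used here; this is not code from the patch, it merely re-implements getSkipFactor and the element-selection rule of getLoadStoreInstMask for VF = 4 and stride 3.

#include <cstdio>

// Mirrors getSkipFactor() above.
static unsigned skipFactor(unsigned VF, unsigned Stride) {
  return (Stride - (VF % Stride)) % Stride;
}

int main() {
  const unsigned VF = 4, Stride = 3;
  unsigned NumVec = ((VF - 1) / Stride) + 1;  // valid elements per access: 2
  std::printf("skip factor = %u\n", skipFactor(VF, Stride));  // prints 2
  // Same element-selection rule as getLoadStoreInstMask() above:
  // lanes 0 and 3 are live, lanes 1 and 2 are masked off -> 1 0 0 1.
  for (unsigned i = 0; i < VF; ++i) {
    bool M = NumVec && ((i % Stride) ? 0 : 1);
    if (M)
      --NumVec;
    std::printf("%d ", (int)M);
  }
  std::printf("\n");
  return 0;
}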
+/// Create the shuffle vector mask for a masked store.
+/// I.e. for a stride 3 store with VF 4 it will generate the following two
+/// masks for the different parts, in 2 iterations:
+/// Part#1 Mask: <i32 4, i32 1, i32 2, i32 5>
+/// Part#2 Mask: <i32 6, i32 1, i32 2, i32 7>
Constant *InnerLoopVectorizer::getStoreShuffleVectorMask(IRBuilder<> &Builder,
+                                                         unsigned VF,
+                                                         unsigned Stride,
+                                                         unsigned Part) {
+  unsigned ValidElements =
+      VF - (getValidMemInSingleVFAccess(VF, Stride) * Part);
+  unsigned StartIndex = VF + Part * getValidMemInSingleVFAccess(VF, Stride);
+  SmallVector<Constant *, 8> Mask;
+  for (unsigned i = 0; i < VF; i++) {
+    // If it is not our element, or the valid elements are used up, keep the
+    // index i (which selects an undef lane from the first operand).
+    if (!ValidElements || (i % Stride)) {
+      Mask.push_back(Builder.getInt32(i));
+    } else {
+      // If it is our element, place StartIndex.
+      Mask.push_back(Builder.getInt32(StartIndex));
+      StartIndex = StartIndex + 1;
+      ValidElements = ValidElements - 1;
+    }
+  }
+  return ConstantVector::get(Mask);
+}
+
+/// Create the mask for the shuffle that combines the loads, based on its
+/// operands.
+/// I.e. for stride 3 and vector factor 4, with two loads as operands, it will
+/// generate mask: <i32 0, i32 3, i32 4, i32 7>
+/// Either operand can itself be a shuffle, i.e.
+/// First operand:
+/// %shuf.vec = shufflevector <4 x i32> ...
+/// Second operand:
+/// %load.vec = load <4 x i32> ...
+/// In this case the already-shuffled elements of the first operand are kept
+/// as-is and the mask continues with the strided elements of the second
+/// operand.
+/// NOTE: If any operand is a shuffle, it has to be the first operand.
+Constant *InnerLoopVectorizer::getLoadShuffleVectorMask(
+    IRBuilder<> &Builder, unsigned Start, unsigned Stride, unsigned VF,
+    bool isFirstOpShuffle, unsigned FirstOpLen, unsigned SecondOpLen) {
+  SmallVector<Constant *, 8> Mask;
+  if (isFirstOpShuffle) {
+    for (unsigned i = 0; i < FirstOpLen; i++) {
+      Mask.push_back(Builder.getInt32(Start + i));
+    }
+  } else {
+    for (unsigned i = 0; i < FirstOpLen; i++) {
+      Mask.push_back(Builder.getInt32(Start + (i * Stride)));
+    }
+  }
+  // For the second vector, move the start by VF.
+  Start = Start + VF;
+  for (unsigned i = 0; i < SecondOpLen; i++) {
+    Mask.push_back(Builder.getInt32(Start + i * Stride));
+  }
+  return ConstantVector::get(Mask);
+}
+
 // Get a mask to interleave \p NumVec vectors into a wide vector.
 // I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
 // E.g. For 2 interleaved vectors, if VF is 4, the mask is:
@@ -2421,6 +2719,13 @@
   if (Instr != Group->getInsertPos())
     return;
 
+  // Vectorize as a strided access if the instruction holds a valid strided operation.
+  if (EnableStridedVectorization && Legal->isValidStrideOperation(Instr) &&
+      (Group->getNumMembers() == 1)) {
+    vectorizeStridedMemoryAccess(Instr);
+    return;
+  }
+
   LoadInst *LI = dyn_cast<LoadInst>(Instr);
   StoreInst *SI = dyn_cast<StoreInst>(Instr);
   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
@@ -2534,6 +2839,244 @@
   }
 }
 
+/// Vectorize a strided load.
+/// It generates multiple vector loads, taking the skip factor into account.
+/// The skip factor is the offset gap between two loads.
+/// It performs multiple loads followed by shuffles to extract the elements.
+///
+/// I.e. for stride 2 with vector factor 4,
+/// <...> = arr[i*2]
+/// will generate the following instructions:
+///
+/// %1 = getelementptr inbounds [100 x i32], [100 x i32]* @a, i64 0, i64 %0
+/// %2 = bitcast i32* %1 to <4 x i32>*
+/// %stride.load = load <4 x i32>, <4 x i32>* %2
+///
+/// %3 = getelementptr i32, i32* %1, i64 4
+/// %4 = bitcast i32* %3 to <4 x i32>*
+/// %stride.load18 = load <4 x i32>, <4 x i32>* %4, align 16, !tbaa !1
+/// %strided.vec = shufflevector <4 x i32> %stride.load, <4 x i32> %stride.load18,
+///                              <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+///
+/// For stride 2 the skip factor will be 0.
+///
+/// For memory safety, if the user wants to generate vector masked loads, this
+/// is supported under the "-enable-strided-mask-load" command line option.
+void InnerLoopVectorizer::vectorizeStridedLoad(Instruction *Instr) {
+  DEBUG(dbgs() << "LV: Vectorizing strided load: " << *Instr << "\n");
+  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
+  unsigned InterleaveFactor = Group->getFactor();
+  LoadInst *LI = dyn_cast<LoadInst>(Instr);
+  Value *Ptr = LI->getPointerOperand();
+  Type *ScalarTy = LI->getType();
+  Type *VecTy = VectorType::get(ScalarTy, VF);
+  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+  unsigned SkipFactor = getSkipFactor(VF, InterleaveFactor);
+  unsigned NumLoadRequired = getRequiredLoadStore(VF, InterleaveFactor);
+  unsigned NumMemAccessInVF = getValidMemInSingleVFAccess(VF, InterleaveFactor);
+
+  setDebugLocFromInst(Builder, Ptr);
+  VectorParts &PtrParts = getVectorValue(Ptr);
+
+  // Consider unrolling.
+  // Widen instructions based on the unroll factor.
+  // PtrParts[URIndex] holds the memory address for a particular unroll index.
+  // For each unroll index, starting from its start address, widen the memory
+  // instructions by generating multiple unrolled versions.
+  for (unsigned URIndex = 0; URIndex < UF; URIndex++) {
+    // Identify the start address.
+    Value *StartAddress = Builder.CreateExtractElement(
+        PtrParts[URIndex],
+        Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
+
+    Instruction *PrevInstr = nullptr;
+    unsigned Index = 0;
+    // Generate the required loads, based on Index.
+    for (unsigned Part = 0; Part < NumLoadRequired; Part++) {
+      // Create a GEP based on Index.
+      // If the stride is 2 and VF is 4, the starting offset will come from:
+      //   %StartIndex = shl i64 %index, 1
+      //
+      // In case of a reverse loop:
+      //   %.lhs = shl i64 %offset.idx, 1
+      //   %StartIndex = add i64 %.lhs, -6
+      Value *NewPtr = Builder.CreateGEP(StartAddress, Builder.getInt32(Index));
+      Instruction *NewLoadInstr = nullptr;
+      bool LastIteration = (Part + 1 >= NumLoadRequired);
+      // Make the last load a masked load.
+      if (EnableStridedMaskLoad && LastIteration) {
+        unsigned ElementsToLoad = VF - (NumMemAccessInVF * Part);
+        Constant *LoadMask =
+            getLoadStoreInstMask(Builder, VF, InterleaveFactor, ElementsToLoad);
+        NewLoadInstr = Builder.CreateMaskedLoad(
+            Builder.CreateBitCast(NewPtr, PtrTy), Group->getAlignment(),
+            LoadMask, UndefValue::get(VecTy), "wide.masked.load");
+      } else {
+        // Alignments smaller than the target default indicate that the code
+        // generator for the target needs to generate the unaligned load/store.
+        // When SkipFactor is 0, generate an aligned access, else generate an
+        // unaligned memory access.
+        // NOTE: An alignment value of 1 indicates an unaligned access.
+        NewLoadInstr = Builder.CreateAlignedLoad(
+            Builder.CreateBitCast(NewPtr, PtrTy),
+            (SkipFactor ? 1 : Group->getAlignment()), "stride.load");
+      }
+      propagateMetadata(NewLoadInstr, Instr);
+
+      if (PrevInstr == nullptr) {
+        PrevInstr = NewLoadInstr;
+      } else {
+        // Check whether the previous instruction is a shuffle.
+        bool isShuffle = dyn_cast<ShuffleVectorInst>(PrevInstr) ? true : false;
+        // Identify the number of elements to shuffle from the first load.
+        unsigned FirstOpLen = isShuffle
+                                  ? dyn_cast<ShuffleVectorInst>(PrevInstr)
+                                        ->getShuffleMask()
+                                        .size()
+                                  : NumMemAccessInVF;
+        // Identify the number of elements to shuffle from the second load.
+        unsigned SecondOpLen =
+            LastIteration ? (VF - (NumMemAccessInVF * Part)) : NumMemAccessInVF;
+        // Create the input mask for the shuffle.
+        Constant *StrideMask = getLoadShuffleVectorMask(
+            Builder, 0, InterleaveFactor, VF, isShuffle, FirstOpLen,
+            SecondOpLen);
+        if (isShuffle) {
+          // Make the types equal for the shuffle.
+          while (PrevInstr->getType() != NewLoadInstr->getType()) {
+            unsigned FirstOpLen =
+                dyn_cast<ShuffleVectorInst>(PrevInstr)->getShuffleMask().size();
+            unsigned SecondOpLen = VF - FirstOpLen;
+            Constant *MakeUpMask = getLoadShuffleVectorMask(
+                Builder, 0, 1,
+                dyn_cast<ShuffleVectorInst>(PrevInstr)->getShuffleMask().size(),
+                isShuffle, FirstOpLen, SecondOpLen);
+            Value *MakeUpShuffle = Builder.CreateShuffleVector(
+                PrevInstr, UndefValue::get(PrevInstr->getType()), MakeUpMask,
+                "strided.vec");
+            PrevInstr = dyn_cast<Instruction>(MakeUpShuffle);
+          }
+        }
+        // Create the shuffle vector.
+        Value *StridedVec = Builder.CreateShuffleVector(
+            PrevInstr, NewLoadInstr, StrideMask, "strided.vec");
+        // If this member has a different type, cast the result type.
+        if (Instr->getType() != ScalarTy) {
+          VectorType *OtherVTy = VectorType::get(Instr->getType(), VF);
+          StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
+        }
+        // Update the instruction in the WidenMap.
+        VectorParts &Entry = WidenMap.get(Instr);
+        Entry[URIndex] =
+            Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
+        PrevInstr = dyn_cast<Instruction>(StridedVec);
+      }
+      // Move the Index.
+      Index = (NumMemAccessInVF * (Part + 1)) * InterleaveFactor;
+    }
+  }
+  return;
+}
+
+/// Vectorize a strided store.
+/// It generates multiple vector masked stores, taking the skip factor into
+/// account. It generates a shuffle followed by a masked store to insert the
+/// elements. The skip factor is the offset gap between two stores.
+/// I.e. for stride 3 with vector factor 4,
+/// arr[3*i] = <...>
+/// will generate the following instructions:
+///
+/// %8 = getelementptr inbounds i32, i32* %arr, i64 %0
+/// %9 = bitcast i32* %8 to <4 x i32>*
+/// %interleaved.vec = shufflevector <4 x i32> %7, <4 x i32> undef, <4 x i32>
+/// call void @llvm.masked.store.v4i32(<4 x i32> %interleaved.vec, <4 x i32>* %9,
+///                                    i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>)
+///
+/// %10 = getelementptr i32, i32* %8, i64 6
+/// %11 = bitcast i32* %10 to <4 x i32>*
+/// %interleaved.vec19 = shufflevector <4 x i32> %7, <4 x i32> undef, <4 x i32>
+/// call void @llvm.masked.store.v4i32(<4 x i32> %interleaved.vec19, <4 x i32>* %11,
+///                                    i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>)
+///
+/// For stride 3 the skip factor will be 2.
+void InnerLoopVectorizer::vectorizeStridedStore(Instruction *Instr) {
+  DEBUG(dbgs() << "LV: Vectorizing strided store: " << *Instr << "\n");
+  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
+  unsigned InterleaveFactor = Group->getFactor();
+  StoreInst *SI = dyn_cast<StoreInst>(Instr);
+  Value *Ptr = SI->getPointerOperand();
+  // Prepare the vector type of the strided store.
+  Type *ScalarTy = SI->getValueOperand()->getType();
+  Type *VecTy = VectorType::get(ScalarTy, VF);
+  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+  // Check the required number of stores & the valid elements in each memory
+  // access.
+  unsigned NumStoreRequired = getRequiredLoadStore(VF, InterleaveFactor);
+  unsigned NumMemAccessInVF = getValidMemInSingleVFAccess(VF, InterleaveFactor);
+
+  setDebugLocFromInst(Builder, Ptr);
+  VectorParts &PtrParts = getVectorValue(Ptr);
+  VectorParts StoredVal = getVectorValue(SI->getValueOperand());
+  Value *UndefVec = UndefValue::get(VecTy);
+  // Consider unrolling.
+  // Widen instructions based on the unroll factor.
+  // PtrParts[URIndex] holds the memory address for a particular unroll index.
+  // For each unroll index, starting from its start address, widen the memory
+  // instructions by generating multiple unrolled versions.
+  for (unsigned URIndex = 0; URIndex < UF; URIndex++) {
+    unsigned Index = 0;
+    // Identify the starting address.
+    Value *StartAddress = Builder.CreateExtractElement(
+        PtrParts[URIndex],
+        Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
+    // Generate the required stores.
+    for (unsigned Part = 0; Part < NumStoreRequired; Part++) {
+      // Create a GEP based on Index.
+      // If the stride is 2 and VF is 4, the starting offset will come from:
+      //   %StartIndex = shl i64 %index, 1
+      //
+      // In case of a reverse loop:
+      //   %.lhs = shl i64 %offset.idx, 1
+      //   %StartIndex = add i64 %.lhs, -6
+      Value *NewPtr = Builder.CreateGEP(StartAddress, Builder.getInt32(Index));
+      NewPtr = Builder.CreateBitCast(NewPtr, PtrTy);
+      // Interleave the elements into the wide vector.
+      Constant *ShuffleMask =
+          getStoreShuffleVectorMask(Builder, VF, InterleaveFactor, Part);
+      // Create the shuffle vector; if the loop is reversed then use the
+      // reversed vector.
+      Value *ShuffleVector = Builder.CreateShuffleVector(
+          UndefVec, (Group->isReverse() ? reverseVector(StoredVal[URIndex])
                                         : StoredVal[URIndex]),
+          ShuffleMask, "interleaved.vec");
+      // Get the masked store instruction's mask.
+      Value *StoreMask =
+          getLoadStoreInstMask(Builder, VF, InterleaveFactor, NumMemAccessInVF);
+      // Generate the masked store instruction.
+      Instruction *NewStoreInstr = Builder.CreateMaskedStore(
+          ShuffleVector, NewPtr, Group->getAlignment(), StoreMask);
+      propagateMetadata(NewStoreInstr, Instr);
+      // Move the Index.
+      Index = (NumMemAccessInVF * (Part + 1)) * InterleaveFactor;
+    }
+  }
+}
+
+/// Try to vectorize strided load and store memory accesses.
+/// For loads it generates multiple load operations followed by shuffles to
+/// model the element extraction. For stores it generates multiple shuffles
+/// followed by masked store instructions.
+void InnerLoopVectorizer::vectorizeStridedMemoryAccess(Instruction *Instr) {
+  LoadInst *LI = dyn_cast<LoadInst>(Instr);
+  StoreInst *SI = dyn_cast<StoreInst>(Instr);
+  if (LI) {
+    vectorizeStridedLoad(Instr);
+  } else if (SI) {
+    vectorizeStridedStore(Instr);
+  } else {
+    DEBUG(dbgs() << "LV: Unsupported instruction for strided vectorization"
+                 << "\n");
+  }
+  return;
+}
+
 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   // Attempt to issue a wide load.
   LoadInst *LI = dyn_cast<LoadInst>(Instr);
@@ -4596,7 +5139,7 @@
 
   // Analyze interleaved memory accesses.
   if (UseInterleaved)
-    InterleaveInfo.analyzeInterleaving(Strides);
+    InterleaveInfo.analyzeInterleaving(Strides, LInfo);
 
   unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
@@ -5044,7 +5587,7 @@
 
 void InterleavedAccessInfo::collectConstStridedAccesses(
     MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
-    const ValueToValueMap &Strides) {
+    const ValueToValueMap &Strides, LoopInfo *LInfo) {
   // Holds load/store instructions in program order.
   SmallVector<Instruction *, 16> AccessList;
 
@@ -5090,6 +5633,12 @@
       Align = DL.getABITypeAlignment(PtrTy->getElementType());
 
     StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
+    // Update the StrideInfo for this instruction.
+    if (EnableStridedVectorization) {
+      StrideInfo.insert(std::pair<Instruction *, StrideAccessInfo>(
+          I,
+          StrideAccessInfo(I, StrideAccesses[I], PSE.getSE(), TheLoop, LInfo)));
+    }
   }
 }
 
@@ -5114,12 +5663,12 @@
 // The store group of (2) is always inserted at or below (2), and the load group
 // of (1) is always inserted at or above (1). The dependence is safe.
 void InterleavedAccessInfo::analyzeInterleaving(
-    const ValueToValueMap &Strides) {
+    const ValueToValueMap &Strides, LoopInfo *LInfo) {
   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
 
   // Holds all the stride accesses.
   MapVector<Instruction *, StrideDescriptor> StrideAccesses;
-  collectConstStridedAccesses(StrideAccesses, Strides);
+  collectConstStridedAccesses(StrideAccesses, Strides, LInfo);
 
   if (StrideAccesses.empty())
     return;
@@ -5202,8 +5751,14 @@
 
   // Remove interleaved store groups with gaps.
   for (InterleaveGroup *Group : StoreGroups)
-    if (Group->getNumMembers() != Group->getFactor())
-      releaseGroup(Group);
+    if (Group->getNumMembers() != Group->getFactor()) {
+      Instruction *I = Group->getMember(0);
+      // Don't release the group if it has only one member and that member is
+      // a valid strided operation; otherwise release it.
+      if (!(EnableStridedVectorization && Group->getNumMembers() == 1 &&
+            isValidStrideOperation(I)))
+        releaseGroup(Group);
+    }
 
   // If there is a non-reversed interleaved load group with gaps, we will need
   // to execute at least one scalar epilogue iteration. This will ensure that
@@ -6036,12 +6591,19 @@
         if (Group->getMember(i))
           Indices.push_back(i);
       }
-
-      // Calculate the cost of the whole interleaved group.
-      unsigned Cost = TTI.getInterleavedMemoryOpCost(
-          I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-          Group->getAlignment(), AS);
-
+      unsigned Cost = 0;
+      // If this is a valid strided access, get the cost of the memory operations
+      // involved in it; otherwise get the cost of the interleaved memory group.
+      if (EnableStridedVectorization && Legal->isValidStrideOperation(I) &&
+          (Group->getNumMembers() == 1)) {
+        Cost = TTI.getStridedMemoryOpCost(I->getOpcode(), WideVecTy,
+                                          Group->getFactor(), Indices,
+                                          Group->getAlignment(), AS);
+      } else {
+        Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+                                              Group->getFactor(), Indices,
+                                              Group->getAlignment(), AS);
+      }
       if (Group->isReverse())
         Cost += Group->getNumMembers() *