Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -501,6 +501,11 @@ const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr = nullptr); +/// \brief Check the stride of the pointer and ensure that it does not wrap in +/// the address space. +int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const ValueToValueMap &StridesMap); + /// \brief This analysis provides dependence information for the memory accesses /// of a loop. /// Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,20 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of the interleaved memory operation. + /// \p Opcode is the memory operation code + /// \p VecTy is the vector type of the interleaved access. + /// \p Delta is the interleave factor + /// \p Indices is the indices for interleaved load members (as interleaved + /// load allows gaps) + /// \p Alignment is the alignment of the memory operation + /// \p AddressSpace is address space of the pointer. + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +596,11 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +759,14 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Delta, Indices, + Alignment, AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,14 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -522,6 +522,73 @@ return Cost; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + 
unsigned Alignment, + unsigned AddressSpace) { + VectorType *VT = dyn_cast(VecTy); + assert(VT && "Expect a vector type for interleaved memory op"); + + unsigned NumElts = VT->getNumElements(); + assert(Delta > 1 && NumElts % Delta == 0 && "Invalid Delta"); + + unsigned NumSubElts = NumElts / Delta; + VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts); + + // Firstly, the cost of load/store operation. + unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + + // Then plus the cost of interleave operation. + if (Opcode == Instruction::Load) { + // The interleave cost is similar to extract sub vectors' elements + // from the wide vector, and insert them into sub vectors. + // + // E.g. Interleaved load of Delta 2 to 1 sub vectors (%v0): + // %vec = load <8 x i32>, <8 x i32>* %ptr + // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 + // The cost is estimated as extract elements at 0, 2, 4, 6 from the + // <8 x i32> vector and insert them into a <4 x i32> vector. + + assert(Indices.size() <= Delta && + "Interleaved memory op has too many members"); + for (unsigned Index : Indices) { + assert(Index < Delta && "Invalid index for interleaved memory op"); + + // Extract elements from loaded vector for each sub vector. + for (unsigned i = 0; i < NumSubElts; i++) + Cost += getVectorInstrCost(Instruction::ExtractElement, VT, + Index + i * Delta); + } + + unsigned InsSubCost = 0; + for (unsigned i = 0; i < NumSubElts; i++) + InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i); + + Cost += Indices.size() * InsSubCost; + } else { + // The interleave cost is extract all elements from sub vectors, and + // insert them into the wide vector. + // + // E.g. Interleaved store of Delta 2 (For vector %v0, %v1): + // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7> + // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr + // The cost is estimated as extract all elements from both <4 x i32> + // vectors and insert into the <8 x i32> vector. + + unsigned ExtSubCost = 0; + for (unsigned i = 0; i < NumSubElts; i++) + ExtSubCost += getVectorInstrCost(Instruction::ExtractElement, SubVT, i); + + Cost += Delta * ExtSubCost; + + for (unsigned i = 0; i < NumElts; i++) + Cost += getVectorInstrCost(Instruction::InsertElement, VT, i); + } + + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -289,11 +289,6 @@ return AR->isAffine(); } -/// \brief Check the stride of the pointer and ensure that it does not wrap in -/// the address space. -static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap); - bool AccessAnalysis::canCheckPtrAtRT( LoopAccessInfo::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &StridesMap, @@ -510,8 +505,8 @@ } /// \brief Check whether the access through \p Ptr has a constant stride. 
-static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap) { +int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const ValueToValueMap &StridesMap) { const Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); @@ -678,6 +673,47 @@ return false; } +/// \brief Check the dependence for two accesses with the same stride \p Stride. +/// \p Distance is the positive distance and \p TypeByteSize is type size in +/// bytes. +/// +/// \returns true if they are independent. +static bool areStridedAccessesIndependent(unsigned Distance, unsigned Stride, + unsigned TypeByteSize) { + assert(Stride > 1 && "The stride must be greater than 1"); + assert(TypeByteSize > 0 && "The type size in byte must be non-zero"); + assert(Distance > 0 && "The distance must be non-zero"); + + // Skip if the distance is not multiple of type byte size. + if (Distance % TypeByteSize) + return false; + + unsigned ScaledDist = Distance / TypeByteSize; + + // (1) If the scaled distance is less than the stride. + // E.g. + // for (i = 0; i < 1024 ; i += 4) + // A[i+2] = A[i] + 1; + // + // Two accesses in memory (scaled distance is 2, stride is 4): + // | A[0] | | | | A[4] | | | | + // | | | A[2] | | | | A[6] | | + // + // (2) Otherwise, no dependence if the scaled distance is not multiple of + // the stride. + // E.g. + // for (i = 0; i < 1024 ; i += 3) + // A[i+4] = A[i] + 1; + // + // Two accesses in memory (scaled distance is 4, stride is 3): + // | A[0] | | | A[3] | | | A[6] | | | + // | | | | | A[4] | | | A[7] | | + if (ScaledDist < Stride) + return true; + else + return ScaledDist % Stride; +} + MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B, unsigned BIdx, @@ -762,8 +798,15 @@ // Write to the same location with the same size. // Could be improved to assert type sizes are the same (i32 == float, etc). if (Val == 0) { - if (ATy == BTy) + if (ATy == BTy) { + // If it is a store-load pair with distance 0, it is store-load forwarding + // and should not be vectorized. + if (AIsWrite && !BIsWrite) + return Dependence::ForwardButPreventsForwarding; + return Dependence::NoDep; + } + DEBUG(dbgs() << "LAA: Zero dependence difference but different types\n"); return Dependence::Unknown; } @@ -778,34 +821,87 @@ unsigned Distance = (unsigned) Val.getZExtValue(); + unsigned Stride = std::abs(StrideAPtr); + if (Stride > 1 && + areStridedAccessesIndependent(Distance, Stride, TypeByteSize)) + return Dependence::NoDep; + // Bail out early if passed-in parameters make vectorization not feasible. unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); + // The minimum number of iterations for a vectorized/unrolled version. + unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U); + + // It's not vectorizable if the distance is smaller than the minimum distance + // needed for a vectroized/unrolled version. Vectorizing one iteration in + // front needs TypeByteSize * Stride. Vectorizing the last iteration needs + // TypeByteSize (No need to plus the last gap distance). + // + // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. 
+  //      foo(int *A) {
+  //        int *B = (int *)((char *)A + 14);
+  //        for (i = 0 ; i < 1024 ; i += 2)
+  //          B[i] = A[i] + 1;
+  //      }
+  //
+  // Two accesses in memory (stride is 2):
+  //     | A[0] |      | A[2] |      | A[4] |      | A[6] |      |
+  //                              | B[0] |      | B[2] |      | B[4] |
+  //
+  // The distance needed to vectorize all iterations except the last one:
+  // 4 * 2 * (MinNumIter - 1). The distance needed for the last iteration: 4.
+  // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
+  //
+  // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
+  // 12, which is less than the distance of 14.
+  //
+  // If MinNumIter is 4 (say, if a user forces the vectorization factor to be
+  // 4), the minimum distance needed is 28, which is greater than the distance
+  // of 14. It is not safe to do vectorization.
+  unsigned MinDistanceNeeded =
+      TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
+  if (MinDistanceNeeded > Distance) {
+    DEBUG(dbgs() << "LAA: Failure because of positive distance " << Distance
+                 << '\n');
+    return Dependence::Backward;
+  }
-  // The distance must be bigger than the size needed for a vectorized version
-  // of the operation and the size of the vectorized operation must not be
-  // bigger than the currrent maximum size.
-  if (Distance < 2*TypeByteSize ||
-      2*TypeByteSize > MaxSafeDepDistBytes ||
-      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
-    DEBUG(dbgs() << "LAA: Failure because of Positive distance "
-                 << Val.getSExtValue() << '\n');
+  // Unsafe if the minimum distance needed is greater than the max safe
+  // distance.
+  if (MinDistanceNeeded > MaxSafeDepDistBytes) {
+    DEBUG(dbgs() << "LAA: Failure because it needs at least "
+                 << MinDistanceNeeded << " size in bytes");
     return Dependence::Backward;
   }
 
   // Positive distance bigger than max vectorization factor.
+  // FIXME: Should use the max vectorization factor instead of the max
+  // distance in bytes, which cannot handle different types.
+  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
+  //      void foo (int *A, char *B) {
+  //        for (unsigned i = 0; i < 1024; i++) {
+  //          A[i+2] = A[i] + 1;
+  //          B[i+2] = B[i] + 1;
+  //        }
+  //      }
+  //
+  // This case is currently unsafe according to the max safe distance. If we
+  // analyze the two accesses on array B, the max safe dependence distance
+  // is 2. Then we analyze the accesses on array A, whose minimum distance
+  // needed is 8, which is greater than 2, so vectorization is forbidden. But
+  // actually both A and B could be vectorized with a factor of 2.
+  MaxSafeDepDistBytes =
+      Distance < MaxSafeDepDistBytes ?
Distance : MaxSafeDepDistBytes; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && couldPreventStoreLoadForward(Distance, TypeByteSize)) return Dependence::BackwardVectorizableButPreventsForwarding; - DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() << - " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'); + DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() + << " with max VF = " + << MaxSafeDepDistBytes / (TypeByteSize * Stride) << '\n'); return Dependence::BackwardVectorizable; } Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Delta, ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Delta, Indices, + Alignment, AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -34,6 +34,10 @@ // Variable uniformity checks are inspired by: // Karrenberg, R. and Hack, S. Whole Function Vectorization. // +// The interleaved access vectorization is based on the paper: +// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved +// Data for SIMD +// // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // @@ -134,6 +138,15 @@ "enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symblic stride memory access versioning")); +static cl::opt EnableInterleavedMemAccesses( + "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization on interleaved memory accesses in a loop")); + +/// Maximum stride for an interleaved memory access. +static cl::opt MaxInterleaveStride( + "max-interleave-access-stride", cl::Hidden, + cl::desc("Maximum interleaved access stride (default = 8)"), cl::init(8)); + /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; @@ -351,6 +364,9 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -545,6 +561,219 @@ propagateMetadata(I, From); } +/// \brief The group of interleaved loads/stores sharing the same stride and +/// close to each other. +/// +/// Each member in this group has an index starting from 0, and the largest +/// index should be less than Delta (interleaved factor), which is the absolute +/// value of the access stride. +/// +/// E.g. An interleaved load group of Delta 4: +/// for (unsigned i = 0; i < 1024; i+=4) { +/// a = A[i]; // Member of index 0 +/// b = A[i+1]; // Member of index 1 +/// d = A[i+3]; // Member of index 3 +/// ... +/// } +/// +/// An interleaved store group of Delta 4: +/// for (unsigned i = 0; i < 1024; i+=4) { +/// ... 
+///     A[i]   = a;  // Member of index 0
+///     A[i+1] = b;  // Member of index 1
+///     A[i+2] = c;  // Member of index 2
+///     A[i+3] = d;  // Member of index 3
+///   }
+///
+/// Note: an interleaved load group may have gaps (missing members), but an
+/// interleaved store group doesn't allow gaps.
+class InterleaveGroup {
+public:
+  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
+      : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
+    assert(Align && "The alignment should be non-zero");
+
+    Delta = std::abs(Stride);
+    assert(Delta > 1 && "Invalid interleave factor");
+
+    Reverse = Stride < 0;
+    Members[0] = Instr;
+  }
+
+  bool isReverse() const { return Reverse; }
+  unsigned getDelta() const { return Delta; }
+  unsigned getAlignment() const { return Align; }
+  unsigned getNumMembers() const { return Members.size(); }
+
+  /// \brief Try to insert a new member \p Instr with index \p Index and
+  /// alignment \p NewAlign. The index is relative to the leader and may be
+  /// negative if the new member becomes the new leader.
+  ///
+  /// \returns false if the instruction doesn't belong to the group.
+  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
+    assert(NewAlign && "The new member's alignment should be non-zero");
+
+    int Key = Index + SmallestKey;
+
+    // Skip if there is already a member with the same index.
+    if (Members.count(Key))
+      return false;
+
+    if (Key > LargestKey) {
+      // The largest index is always less than Delta.
+      if (Index >= static_cast<int>(Delta))
+        return false;
+
+      LargestKey = Key;
+    } else if (Key < SmallestKey) {
+      // The index span of the group is always less than Delta.
+      if (LargestKey - Key >= static_cast<int>(Delta))
+        return false;
+
+      SmallestKey = Key;
+    }
+
+    // It's always safe to select the minimum alignment.
+    Align = std::min(Align, NewAlign);
+    Members[Key] = Instr;
+    return true;
+  }
+
+  /// \brief Get the member with the given index \p Index.
+  ///
+  /// \returns nullptr if the group contains no such member.
+  Instruction *getMember(unsigned Index) const {
+    int Key = SmallestKey + Index;
+    if (!Members.count(Key))
+      return nullptr;
+
+    return Members.find(Key)->second;
+  }
+
+  /// \brief Get the index for the given member. Unlike the key in the member
+  /// map, the index starts from 0.
+  unsigned getIndex(Instruction *Instr) const {
+    for (auto I : Members)
+      if (I.second == Instr)
+        return I.first - SmallestKey;
+
+    llvm_unreachable("InterleaveGroup contains no such member");
+  }
+
+  Instruction *getInsertPos() const { return InsertPos; }
+  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
+
+private:
+  unsigned Delta;    // The interleave factor.
+  bool Reverse;
+  unsigned Align;
+  DenseMap<int, Instruction *> Members;
+  int SmallestKey;
+  int LargestKey;
+
+  // To avoid breaking dependences, vectorized instructions of an interleave
+  // group should be inserted at either the first load or the last store in
+  // program order.
+  //
+  // E.g. %even = load i32          // Insert Position
+  //      %add = add i32 %even      // Use of %even
+  //      %odd = load i32
+  //
+  //      store i32 %even
+  //      %odd = add i32            // Def of %odd
+  //      store i32 %odd            // Insert Position
+  Instruction *InsertPos;
+};
+
+/// \brief Drive the analysis of interleaved memory accesses in the loop.
+///
+/// Use this class to analyze interleaved accesses only when we can vectorize
+/// a loop. Otherwise it's meaningless to do the analysis, as vectorizing
+/// interleaved accesses would be unsafe.
+///
+/// The analysis collects interleave groups and records the relationship
+/// between each member and its group in a map.
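A brief usage sketch may help here. It mirrors how LoopVectorizationLegality drives this analysis later in the patch and is purely illustrative; the local names (IAI, Instr) and the surrounding SE, TheLoop, DT and Strides values are assumed to be in scope, not part of the patched code:

    InterleavedAccessInfo IAI(SE, TheLoop, DT);
    IAI.analyzeInterleaving(Strides);          // Collect the interleave groups.
    if (IAI.isInterleaved(Instr)) {            // Query a particular load/store.
      InterleaveGroup *Group = IAI.getInterleaveGroup(Instr);
      unsigned Factor = Group->getDelta();     // Interleave factor of the group.
    }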
+class InterleavedAccessInfo { +public: + InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT) + : SE(SE), TheLoop(L), DT(DT) {} + + ~InterleavedAccessInfo() { + SmallSet DelSet; + // Avoid releasing a pointer twice. + for (auto &I : InterleaveGroupMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + + /// \brief Analyze the interleaved accesses and collect them in interleave + /// groups. Substitute symbolic strides using \p Strides. + void analyzeInterleaving(const ValueToValueMap &Strides); + + /// \brief Check if \p Instr belongs to any interleave group. + bool isInterleaved(Instruction *Instr) const { + return InterleaveGroupMap.count(Instr); + } + + /// \brief Get the interleave group that \p Instr belongs to. + /// + /// \returns nullptr if doesn't have such group. + InterleaveGroup *getInterleaveGroup(Instruction *Instr) const { + if (InterleaveGroupMap.count(Instr)) + return InterleaveGroupMap.find(Instr)->second; + return nullptr; + } + +private: + ScalarEvolution *SE; + Loop *TheLoop; + DominatorTree *DT; + + /// Holds the relationships between the members and the interleave group. + DenseMap InterleaveGroupMap; + + /// \brief The descriptor for a strided memory access. + struct StrideDescriptor { + StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, + unsigned Align) + : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} + + StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {} + + int Stride; // The access's stride. It is negative for a reverse access. + const SCEV *Scev; // The scalar expression of this access + unsigned Size; // The size of the memory object. + unsigned Align; // The alignment of this access. + }; + + /// \brief Create a new interleave group with the given instruction \p Instr, + /// stride \p Stride and alignment \p Align. + /// + /// \returns the newly created interleave group. + InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, + unsigned Align) { + assert(!InterleaveGroupMap.count(Instr) && + "Already in an interleaved access group"); + InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); + return InterleaveGroupMap[Instr]; + } + + /// \brief Release the group and remove all the relationships. + void releaseGroup(InterleaveGroup *Group) { + for (unsigned i = 0; i < Group->getDelta(); i++) + if (Instruction *Member = Group->getMember(i)) + InterleaveGroupMap.erase(Member); + + delete Group; + } + + /// \brief Collect all the accesses with a constant stride in program order. + void collectConstStridedAccesses( + MapVector &StrideAccesses, + const ValueToValueMap &Strides); +}; + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -565,8 +794,8 @@ Function *F, const TargetTransformInfo *TTI, LoopAccessAnalysis *LAA) : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), - TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), - WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} + TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} /// This enum represents the kinds of inductions that we support. enum InductionKind { @@ -697,6 +926,16 @@ return LAI; } + /// \brief Check if \p Instr belongs to any interleaved access group. 
+ bool isAccessInterleaved(Instruction *Instr) { + return InterleaveInfo.isInterleaved(Instr); + } + + /// \brief Get the interleaved access group that \p Instr belongs to. + const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { + return InterleaveInfo.getInterleaveGroup(Instr); + } + unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } bool hasStride(Value *V) { return StrideSet.count(V); } @@ -792,6 +1031,10 @@ // null until canVectorizeMemory sets it up. const LoopAccessInfo *LAI; + /// The interleave access information contains groups of interleaved accesses + /// with the same stride and close to each other. + InterleavedAccessInfo InterleaveInfo; + // --- vectorization state --- // /// Holds the integer induction variable. This is the counter of the @@ -1657,6 +1900,251 @@ "reverse"); } +// Get a mask to interleave \p NumVec vectors into a wide vector. +// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors, if VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index \p Start. +// I.e. +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consists of sequential integers +// starting from 0, The second part consists of UNDEFs. +// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, + unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = 0; i < NumInt; i++) + Mask.push_back(Builder.getInt32(i)); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 2nd vector should +// not have more elements than the 1st vector. If the 2nd vector has less +// elements, extend it with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, + Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); + + if (NumElts1 > NumElts2) { + // Extend with UNDEFs. + Constant *ExtMask = + getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); + V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); + } + + Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); +} + +// Concatenate vectors in the given list. All vectors have the same type. 
+static Value *ConcatenateVectors(IRBuilder<> &Builder, + ArrayRef InputList) { + unsigned NumVec = InputList.size(); + assert(NumVec > 1 && "Should be at least two vectors"); + + SmallVector ResList; + ResList.append(InputList.begin(), InputList.end()); + do { + SmallVector TmpList; + for (unsigned i = 0; i < NumVec - 1; i += 2) { + Value *V0 = ResList[i], *V1 = ResList[i + 1]; + assert((V0->getType() == V1->getType() || i == NumVec - 2) && + "Only the last vector may have a different type"); + + TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1)); + } + + // Push the last vector if the total number of vectors is odd. + if (NumVec % 2 != 0) + TmpList.push_back(ResList[NumVec - 1]); + + ResList = TmpList; + NumVec = ResList.size(); + } while (NumVec > 1); + + return ResList[0]; +} + +// Try to vectorize the interleave group that \p Instr belongs to. +// +// E.g. Translate following interleaved load group (Delta is 3): +// for (i = 0; i < N; i+=3) { +// R = Pic[i]; // Member of index 0 +// G = Pic[i+1]; // Member of index 1 +// B = Pic[i+2]; // Member of index 2 +// ... // do something to R, G, B +// } +// To: +// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B +// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements +// +// Or translate following interleaved store group (Delta is 3): +// for (i = 0; i < N; i+=3) { +// ... do something to R, G, B +// Pic[i] = R; // Member of index 0 +// Pic[i+1] = G; // Member of index 1 +// Pic[i+2] = B; // Member of index 2 +// } +// To: +// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> +// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %interleaved.vec = shuffle %R_G.vec, %B_U.vec, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements +// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { + const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); + assert(Group && "Fail to get an interleaved access group."); + + // Skip if current instruction is not the insert position. + if (Instr != Group->getInsertPos()) + return; + + LoadInst *LI = dyn_cast(Instr); + StoreInst *SI = dyn_cast(Instr); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + unsigned Delta = Group->getDelta(); + Type *VecTy = VectorType::get(ScalarTy, Delta * VF); + Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + + // Prepare for the new pointers. + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + unsigned Index = Group->getIndex(Instr); + for (unsigned Part = 0; Part < UF; Part++) { + // Extract the pointer for current instruction from the pointer vector. A + // reverse access uses the pointer in the last lane. + Value *NewPtr = Builder.CreateExtractElement( + PtrParts[Part], + Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0)); + + // Notice current instruction could be any index. Need to adjust the address + // to the member of index 0. + // + // E.g. a = A[i+1]; // Member of index 1 (Current instruction) + // b = A[i]; // Member of index 0 + // Current pointer is pointed to A[i+1], adjust it to A[i]. + // + // E.g. 
A[i+1] = a; // Member of index 1 + // A[i] = b; // Member of index 0 + // A[i+2] = c; // Member of index 2 (Current instruction) + // Current pointer is pointed to A[i+2], adjust it to A[i]. + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); + + // Cast to the vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + Value *UndefVec = UndefValue::get(VecTy); + + // Vectorize the interleaved load group. + if (LI) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *NewLoadInstr = Builder.CreateAlignedLoad( + NewPtrs[Part], Group->getAlignment(), "wide.vec"); + + for (unsigned i = 0; i < Delta; i++) { + Instruction *Member = Group->getMember(i); + + // Skip the gaps in the group. + if (!Member) + continue; + + Constant *StrideMask = getStridedMask(Builder, i, Delta, VF); + Value *StridedVec = Builder.CreateShuffleVector( + NewLoadInstr, UndefVec, StrideMask, "strided.vec"); + + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); + } + + VectorParts &Entry = WidenMap.get(Member); + Entry[Part] = + Group->isReverse() ? reverseVector(StridedVec) : StridedVec; + } + + propagateMetadata(NewLoadInstr, Instr); + } + return; + } + + // The sub vector type for current instruction. + VectorType *SubVT = VectorType::get(ScalarTy, VF); + + // Vectorize the interleaved store group. + for (unsigned Part = 0; Part < UF; Part++) { + // Collect the stored vector from each member. + SmallVector StoredVecs; + for (unsigned i = 0; i < Delta; i++) { + // Interleaved store group doesn't allow a gap, so each index has a member + Instruction *Member = Group->getMember(i); + assert(Member && "Fail to get a member from an interleaved store group"); + + Value *StoredVec = + getVectorValue(dyn_cast(Member)->getValueOperand())[Part]; + if (Group->isReverse()) + StoredVec = reverseVector(StoredVec); + + // If this member has different type, cast it to an unified type. + if (StoredVec->getType() != SubVT) + StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); + + StoredVecs.push_back(StoredVec); + } + + // Concatenate all vectors into a wide vector. + Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + + // Interleave the elements in the wide vector. + Constant *IMask = getInterleavedMask(Builder, VF, Delta); + Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, + "interleaved.vec"); + + Instruction *NewStoreInstr = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); + propagateMetadata(NewStoreInstr, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1664,6 +2152,10 @@ assert((LI || SI) && "Invalid Load/Store instruction"); + // Try to vectorize the interleave group if this access is interleaved. + if (Legal->isAccessInterleaved(Instr)) + return vectorizeInterleaveGroup(Instr); + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); @@ -3408,6 +3900,10 @@ "") <<"!\n"); + // Analyze interleaved memory accesses. + if (EnableInterleavedMemAccesses) + InterleaveInfo.analyzeInterleaving(Strides); + // Okay! We can vectorize. 
At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with // no restrictions. @@ -3923,6 +4419,165 @@ return true; } +void InterleavedAccessInfo::collectConstStridedAccesses( + MapVector &StrideAccesses, + const ValueToValueMap &Strides) { + // Holds load/store instructions in program order. + SmallVector AccessList; + + for (auto *BB : TheLoop->getBlocks()) { + bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); + + for (auto &I : *BB) { + if (!isa(&I) && !isa(&I)) + continue; + // FIXME: Currently we can't handle mixed accesses and predicated accesses + if (IsPred) + return; + + AccessList.push_back(&I); + } + } + + if (AccessList.empty()) + return; + + auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (auto I : AccessList) { + LoadInst *LI = dyn_cast(I); + StoreInst *SI = dyn_cast(I); + + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + + // Ignore non-stride, unit strides and too large strides. + if (std::abs(Stride) < 2 || + std::abs(Stride) > static_cast(MaxInterleaveStride)) + continue; + + const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + PointerType *PtrTy = dyn_cast(Ptr->getType()); + unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); + + // An alignment of 0 means target ABI alignment. + unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(PtrTy->getElementType()); + + StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align); + } +} + +// Analyze interleaved accesses and collect them into interleave groups. +// +// Notice that the vectorization on interleaved groups will change instruction +// orders and may break dependences. But the memory dependence check guarantees +// that there is no overlap between two pointers of different strides, element +// sizes or underlying bases. +// +// For pointers sharing the same stride, element size and underlying base, no +// need to worry about Read-After-Write dependences and Write-After-Read +// dependences. +// +// E.g. The RAW dependence: A[i] = a; +// b = A[i]; +// This won't exist as it is a store-load forwarding conflict, which has +// already been checked and forbidden in the dependence check. +// +// E.g. The WAR dependence: a = A[i]; // (1) +// A[i] = b; // (2) +// The store group of (2) is always inserted at or below (2), and the load group +// of (1) is always inserted at or above (1). The dependence is safe. +void InterleavedAccessInfo::analyzeInterleaving( + const ValueToValueMap &Strides) { + DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); + + // Holds all the stride accesses. + MapVector StrideAccesses; + collectConstStridedAccesses(StrideAccesses, Strides); + + if (StrideAccesses.empty()) + return; + + // Holds all interleaved store groups temporarily. + SmallSetVector StoreGroups; + + // Search the load-load/write-write pair B-A in bottom-up order and try to + // insert B into the interleave group of A according to 3 rules: + // 1. A and B have the same stride. + // 2. A and B have the same memory object size. + // 3. B belongs to the group according to the distance. + // + // The bottom-up order can avoid breaking the Write-After-Write dependences + // between two pointers of the same base. + // E.g. 
A[i] = a;     (1)
+  //       A[i] = b;     (2)
+  //       A[i+1] = c;   (3)
+  // We form the group (2)+(3) first, so (1) has to form a group with accesses
+  // above (1), which guarantees that (1) is always inserted above (2).
+  for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
+       ++I) {
+    Instruction *A = I->first;
+    StrideDescriptor DesA = I->second;
+
+    InterleaveGroup *Group = getInterleaveGroup(A);
+    if (!Group) {
+      DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
+      Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+    }
+
+    if (A->mayWriteToMemory())
+      StoreGroups.insert(Group);
+
+    for (auto II = std::next(I); II != E; ++II) {
+      Instruction *B = II->first;
+      StrideDescriptor DesB = II->second;
+
+      // Ignore if B is already in a group or B is a different memory operation.
+      if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
+        continue;
+
+      // Check rules 1 and 2.
+      if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
+        continue;
+
+      // Calculate the distance and prepare for rule 3.
+      const SCEVConstant *DistToA =
+          dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev));
+      if (!DistToA)
+        continue;
+
+      int DistanceToA = DistToA->getValue()->getValue().getSExtValue();
+
+      // The previous dependence check guarantees that there is no dependence
+      // between two accesses if the distance is not a multiple of the size.
+      // So just ignore such cases; there is no dependence to worry about.
+      if (DistanceToA % static_cast<int>(DesA.Size))
+        continue;
+
+      // The index of B is the index of A plus B's index relative to A.
+      int IndexB =
+          Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);
+
+      // Try to insert B into the group.
+      if (Group->insertMember(B, IndexB, DesB.Align)) {
+        DEBUG(dbgs() << "LV: Inserted:" << *B << '\n'
+                     << "     into the interleave group with" << *A << '\n');
+        InterleaveGroupMap[B] = Group;
+
+        // Set the first load in program order as the insert position.
+        if (B->mayReadFromMemory())
+          Group->setInsertPos(B);
+      }
+    } // Iteration on instruction B
+  }   // Iteration on instruction A
+
+  // Remove interleaved store groups with gaps.
+  for (InterleaveGroup *Group : StoreGroups)
+    if (Group->getNumMembers() != Group->getDelta())
+      releaseGroup(Group);
+}
+
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
   // Width 1 means no vectorize
@@ -4575,6 +5230,46 @@
   return TTI.getAddressComputationCost(VectorTy) +
          TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+  // For an interleaved access, calculate the total cost of the whole
+  // interleave group.
+  if (Legal->isAccessInterleaved(I)) {
+    auto Group = Legal->getInterleavedAccessGroup(I);
+    assert(Group && "Fail to get an interleaved access group.");
+
+    // Only calculate the cost once at the insert position.
+    if (Group->getInsertPos() != I)
+      return 0;
+
+    unsigned Delta = Group->getDelta();
+    Type *WideVecTy =
+        VectorType::get(VectorTy->getVectorElementType(),
+                        VectorTy->getVectorNumElements() * Delta);
+
+    // Holds the indices of existing members in an interleaved load group.
+    // An interleaved store group doesn't need this as it doesn't allow gaps.
+    SmallVector<unsigned, 4> Indices;
+    if (LI) {
+      for (unsigned i = 0; i < Delta; i++)
+        if (Group->getMember(i))
+          Indices.push_back(i);
+    }
+
+    // Calculate the cost of the whole interleaved group.
+ unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, + Group->getDelta(), Indices, + Group->getAlignment(), AS); + + if (Group->isReverse()) + Cost += + Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + + // FIXME: The interleaved load group with a huge gap could be even more + // expensive than scalar operations. Then we could ignore such group and + // use scalar operations instead. + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll =================================================================== --- /dev/null +++ test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll @@ -0,0 +1,540 @@ +; RUN: opt -loop-accesses -analyze < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Following cases are no dependence. + +; void nodep_Read_Write(int *A) { +; int *B = A + 1; +; for (unsigned i = 0; i < 1024; i+=3) +; B[i] = A[i] + 1; +; } + +; CHECK: function 'nodep_Read_Write': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: Run-time memory checks: + +define void @nodep_Read_Write(i32* nocapture %A) { +entry: + %add.ptr = getelementptr inbounds i32, i32* %A, i64 1 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %add.ptr, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; int nodep_Write_Read(int *A) { +; int sum = 0; +; for (unsigned i = 0; i < 1024; i+=4) { +; A[i] = i; +; sum += A[i+3]; +; } +; +; return sum; +; } + +; CHECK: function 'nodep_Write_Read': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: Run-time memory checks: + +define i32 @nodep_Write_Read(i32* nocapture %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %add3 + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum.013 = phi i32 [ 0, %entry ], [ %add3, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 3 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %sum.013 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; void nodep_Write_Write(int *A) { +; for (unsigned i = 0; i < 1024; i+=2) { +; A[i] = i; +; A[i+1] = i+1; +; } +; } + +; CHECK: function 'nodep_Write_Write': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: Run-time memory checks: + +define void @nodep_Write_Write(i32* nocapture %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + 
ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = trunc i64 %1 to i32 + store i32 %2, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Following cases are unsafe depdences and are not vectorizable. + +; void unsafe_Read_Write(int *A) { +; for (unsigned i = 0; i < 1024; i+=3) +; A[i+3] = A[i] + 1; +; } + +; CHECK: function 'unsafe_Read_Write': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: %0 = load i32, i32* %arrayidx, align 4 -> +; CHECK-NEXT: store i32 %add, i32* %arrayidx3, align 4 + +define void @unsafe_Read_Write(i32* nocapture %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %idxprom = zext i32 %i.010 to i64 + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, 1 + %add1 = add i32 %i.010, 3 + %idxprom2 = zext i32 %add1 to i64 + %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %idxprom2 + store i32 %add, i32* %arrayidx3, align 4 + %cmp = icmp ult i32 %add1, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; int unsafe_Write_Read(int *A) { +; int sum = 0; +; for (unsigned i = 0; i < 1024; i+=4) { +; A[i] = i; +; sum += A[i+4]; +; } +; +; return sum; +; } + +; CHECK: function 'unsafe_Write_Read': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 -> +; CHECK-NEXT: %1 = load i32, i32* %arrayidx2, align 4 + +define i32 @unsafe_Write_Read(i32* nocapture %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %add3 + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum.013 = phi i32 [ 0, %entry ], [ %add3, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %1, %sum.013 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; void unsafe_Write_Write(int *A) { +; for (unsigned i = 0; i < 1024; i+=2) { +; A[i] = i; +; A[i+2] = i+1; +; } +; } + +; CHECK: function 'unsafe_Write_Write': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 -> +; CHECK-NEXT: store i32 %2, i32* %arrayidx3, align 4 + +define void @unsafe_Write_Write(i32* nocapture %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + 
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next + %2 = trunc i64 %1 to i32 + store i32 %2, i32* %arrayidx3, align 4 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Following cases check that strided accesses can be vectorized. + +; void vectorizable_Read_Write(int *A) { +; int *B = A + 4; +; for (unsigned i = 0; i < 1024; i+=2) +; B[i] = A[i] + 1; +; } + +; CHECK: function 'vectorizable_Read_Write': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: BackwardVectorizable: +; CHECK-NEXT: %0 = load i32, i32* %arrayidx, align 4 -> +; CHECK-NEXT: store i32 %add, i32* %arrayidx2, align 4 + +define void @vectorizable_Read_Write(i32* nocapture %A) { +entry: + %add.ptr = getelementptr inbounds i32, i32* %A, i64 4 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %add.ptr, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; int vectorizable_Write_Read(int *A) { +; int *B = A + 4; +; int sum = 0; +; for (unsigned i = 0; i < 1024; i+=2) { +; A[i] = i; +; sum += B[i]; +; } +; +; return sum; +; } + +; CHECK: function 'vectorizable_Write_Read': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: BackwardVectorizable: +; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 -> +; CHECK-NEXT: %1 = load i32, i32* %arrayidx2, align 4 + +define i32 @vectorizable_Write_Read(i32* nocapture %A) { +entry: + %add.ptr = getelementptr inbounds i32, i32* %A, i64 4 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %add + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum.013 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %add.ptr, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %sum.013 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; void vectorizable_Write_Write(int *A) { +; int *B = A + 4; +; for (unsigned i = 0; i < 1024; i+=2) { +; A[i] = i; +; B[i] = i+1; +; } +; } + +; CHECK: function 'vectorizable_Write_Write': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: BackwardVectorizable: +; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 -> +; CHECK-NEXT: store i32 %2, i32* %arrayidx2, align 4 + +define void @vectorizable_Write_Write(i32* 
nocapture %A) { +entry: + %add.ptr = getelementptr inbounds i32, i32* %A, i64 4 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %add.ptr, i64 %indvars.iv + %2 = trunc i64 %1 to i32 + store i32 %2, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; void vectorizable_unscaled_Read_Write(int *A) { +; int *B = (int *)((char *)A + 14); +; for (unsigned i = 0; i < 1024; i+=2) +; B[i] = A[i] + 1; +; } + +; FIXME: This case looks like previous case @vectorizable_Read_Write. It sould +; to be vectorizable. + +; CHECK: function 'vectorizable_unscaled_Read_Write': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: BackwardVectorizableButPreventsForwarding: +; CHECK-NEXT: %2 = load i32, i32* %arrayidx, align 4 -> +; CHECK-NEXT: store i32 %add, i32* %arrayidx2, align 4 + +define void @vectorizable_unscaled_Read_Write(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 14 + %1 = bitcast i8* %add.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %2, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; int vectorizable_unscaled_Write_Read(int *A) { +; int *B = (int *)((char *)A + 17); +; int sum = 0; +; for (unsigned i = 0; i < 1024; i+=2) { +; A[i] = i; +; sum += B[i]; +; } +; +; return sum; +; } + +; CHECK: for function 'vectorizable_unscaled_Write_Read': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: BackwardVectorizable: +; CHECK-NEXT: store i32 %2, i32* %arrayidx, align 4 -> +; CHECK-NEXT: %3 = load i32, i32* %arrayidx2, align 4 + +define i32 @vectorizable_unscaled_Write_Read(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 17 + %1 = bitcast i8* %add.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %add + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum.013 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = trunc i64 %indvars.iv to i32 + store i32 %2, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + %3 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %3, %sum.013 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; void 
unsafe_unscaled_Read_Write(int *A) { +; int *B = (int *)((char *)A + 11); +; for (unsigned i = 0; i < 1024; i+=2) +; B[i] = A[i] + 1; +; } + +; CHECK: function 'unsafe_unscaled_Read_Write': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: %2 = load i32, i32* %arrayidx, align 4 -> +; CHECK-NEXT: store i32 %add, i32* %arrayidx2, align 4 + +define void @unsafe_unscaled_Read_Write(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 11 + %1 = bitcast i8* %add.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %2, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; CHECK: function 'unsafe_unscaled_Read_Write2': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop +; CHECK-NEXT: Interesting Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: %2 = load i32, i32* %arrayidx, align 4 -> +; CHECK-NEXT: store i32 %add, i32* %arrayidx2, align 4 + +; void unsafe_unscaled_Read_Write2(int *A) { +; int *B = (int *)((char *)A + 1); +; for (unsigned i = 0; i < 1024; i+=2) +; B[i] = A[i] + 1; +; } + +define void @unsafe_unscaled_Read_Write2(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %add.ptr = getelementptr inbounds i8, i8* %0, i64 1 + %1 = bitcast i8* %add.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %2, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Following case checks that interleaved stores have dependences with another +; store and can not pass dependence check. + +; void interleaved_stores(int *A) { +; int *B = (int *) ((char *)A + 1); +; for(int i = 0; i < 1024; i+=2) { +; B[i] = i; // (1) +; A[i+1] = i + 1; // (2) +; B[i+1] = i + 1; // (3) +; } +; } +; +; The access (2) has overlaps with (1) and (3). 
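; To make the overlap concrete (B is A offset by 1 byte, i32 elements are
; 4 bytes): in iteration i, (1) stores to bytes [4i+1, 4i+5), (2) stores to
; bytes [4i+4, 4i+8), and (3) stores to bytes [4i+5, 4i+9). The distances from
; (1) to (2) and from (2) to (3) are 3 bytes and 1 byte, neither a multiple of
; the 4-byte element size, so the stores partially overlap and the dependence
; check must treat them as unsafe.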
+
+; CHECK: function 'interleaved_stores':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %4, i32* %arrayidx5, align 4 ->
+; CHECK-NEXT: store i32 %4, i32* %arrayidx9, align 4
+; CHECK: Backward:
+; CHECK-NEXT: store i32 %2, i32* %arrayidx2, align 4 ->
+; CHECK-NEXT: store i32 %4, i32* %arrayidx5, align 4
+
+define void @interleaved_stores(i32* nocapture %A) {
+entry:
+  %0 = bitcast i32* %A to i8*
+  %incdec.ptr = getelementptr inbounds i8, i8* %0, i64 1
+  %1 = bitcast i8* %incdec.ptr to i32*
+  br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+  ret void
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %2 = trunc i64 %indvars.iv to i32
+  %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv
+  store i32 %2, i32* %arrayidx2, align 4
+  %3 = or i64 %indvars.iv, 1
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %3
+  %4 = trunc i64 %3 to i32
+  store i32 %4, i32* %arrayidx5, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %1, i64 %3
+  store i32 %4, i32* %arrayidx9, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
Index: test/Analysis/LoopAccessAnalysis/zero-distance-dependence.ll
===================================================================
--- /dev/null
+++ test/Analysis/LoopAccessAnalysis/zero-distance-dependence.ll
@@ -0,0 +1,75 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; This case checks store-to-load forwarding for a store-load pair with distance 0.
+
+; int dist_zero_true_dep (int *A) {
+;   int *B = A;
+;   int sum = 0;
+;   for (unsigned i = 0; i < 1024; i++) {
+;     B[i] = i;
+;     sum += A[i];
+;   }
+;   return sum;
+; }
+
+; CHECK: function 'dist_zero_true_dep':
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop
+; CHECK-NEXT: Interesting Dependences:
+; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: store i32 %0, i32* %arrayidx, align 4 ->
+; CHECK-NEXT: %1 = load i32, i32* %arrayidx2, align 4
+
+define i32 @dist_zero_true_dep(i32* nocapture %A) {
+entry:
+  %B = getelementptr inbounds i32, i32* %A, i64 0
+  br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+  ret i32 %add
+
+for.body: ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %sum.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; This case checks that two accesses with distance 0 do not form a true dependence.
+ +; int dist_zero_not_true_dep (int *A) { +; for (unsigned i = 0; i < 1024; i+=2) +; A[i] += i; +; } + +; CHECK: function 'dist_zero_not_true_dep': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Memory dependences are safe + +define i32 @dist_zero_not_true_dep(i32* nocapture %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 undef + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add i32 %0, %1 + store i32 %add, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,5 +1,5 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses=true | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -enable-interleaved-mem-accesses=true | FileCheck %s --check-prefix=FORCE-VEC target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/interleaved-accesses.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -0,0 +1,422 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true 
-runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check vectorization on an interleaved load group of Delta 2 and an interleaved +; store group of Delta 2. + +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +%struct.ST2 = type { i32, i32 } +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of Delta 3 and an interleaved +; store group of Delta 3. 
+
+; int A[3072];
+; struct ST3 S[1024];
+; void test_struct_array_load3_store3() {
+;   int *ptr = A;
+;   for (int i = 0; i < 1024; i++) {
+;     int X1 = *ptr++;
+;     int X2 = *ptr++;
+;     int X3 = *ptr++;
+;     S[i].x = X1 + 1;
+;     S[i].y = X2 + 2;
+;     S[i].z = X3 + 3;
+;   }
+; }
+
+; CHECK-LABEL: @test_struct_array_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
+
+%struct.ST3 = type { i32, i32, i32 }
+@A = common global [3072 x i32] zeroinitializer, align 4
+@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
+
+define void @test_struct_array_load3_store3() {
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
+  %0 = load i32, i32* %ptr.016, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
+  %2 = load i32, i32* %incdec.ptr1, align 4
+  %add = add nsw i32 %0, 1
+  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %add3 = add nsw i32 %1, 2
+  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
+  store i32 %add3, i32* %y, align 4
+  %add6 = add nsw i32 %2, 3
+  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
+  store i32 %add6, i32* %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+  ret void
+}
+
+; Check vectorization on an interleaved load group of Delta 4.
+ +; struct ST4{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST4 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + +; Check vectorization on an interleaved store group of Delta 4. 
+ +; void test_struct_store4(int *A, struct ST4 *B) { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; B[i].x = X + 1; +; B[i].y = X * 2; +; B[i].z = X + 3; +; B[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1 + %0 = load i32, i32* %ptr.024, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %add3 = add nsw i32 %0, 3 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2 + store i32 %add3, i32* %z, align 4 + %add6 = add nsw i32 %0, 4 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3 + store i32 %add6, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Check vectorization on a reverse interleaved load group of Delta 2 and +; a reverse interleaved store group of Delta 2. 
+ +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on an interleaved load group of Delta 2 with 1 gap +; (missing the load of odd elements). + +; void even_load(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK-LABEL: @even_load( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shl nsw <4 x i32> %strided.vec, +define void @even_load(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on interleaved access groups identified from mixed +; loads/stores. 
+; void mixed_load_store(int *A, int *B) {
+;   for (unsigned i = 0; i < 1024; i+=2) {
+;     B[i] = A[i] * A[i+1];
+;     B[i+1] = A[i] + A[i+1];
+;   }
+; }
+
+; CHECK-LABEL: @mixed_load_store(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32>
+; CHECK: store <8 x i32> %interleaved.vec
+define void @mixed_load_store(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+entry:
+  br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+  ret void
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1
+  %2 = load i32, i32* %arrayidx2, align 4
+  %mul = mul nsw i32 %2, %0
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx4, align 4
+  %3 = load i32, i32* %arrayidx, align 4
+  %4 = load i32, i32* %arrayidx2, align 4
+  %add10 = add nsw i32 %4, %3
+  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %1
+  store i32 %add10, i32* %arrayidx13, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+  %cmp = icmp ult i64 %indvars.iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on interleaved access groups whose members have
+; different types; a rough lane-layout sketch follows the last test below.
+
+; struct IntFloat {
+;   int a;
+;   float b;
+; };
+;
+; int SA;
+; float SB;
+;
+; void int_float_struct(struct IntFloat *A) {
+;   int SumA;
+;   float SumB;
+;   for (unsigned i = 0; i < 1024; i++) {
+;     SumA += A[i].a;
+;     SumB += A[i].b;
+;   }
+;   SA = SumA;
+;   SB = SumB;
+; }
+
+; CHECK-LABEL: @int_float_struct(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
+; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
+; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
+; CHECK: add nsw <4 x i32>
+; CHECK: fadd fast <4 x float>
+
+%struct.IntFloat = type { i32, float }
+
+@SA = common global i32 0, align 4
+@SB = common global float 0.000000e+00, align 4
+
+define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
+entry:
+  br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+  store i32 %add, i32* @SA, align 4
+  store float %add3, float* @SB, align 4
+  ret void
+
+for.body: ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
+  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
+  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
+  %0 = load i32, i32* %a, align 4
+  %add = add nsw i32 %0, %SumA.013
+  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
+  %1 = load float, float* %b, align 4
+  %add3 = fadd fast float %SumB.014, %1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
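+
+; A rough lane-layout sketch for the mixed i32/float group in @int_float_struct
+; above (an illustration only, not part of the checked output; %ptr, %a.vec,
+; %b.int and %b.vec are made-up names). With 4-byte a and b members laid out in
+; memory as a0 b0 a1 b1 a2 b2 a3 b3, the expected widening is roughly:
+;   %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
+;   %a.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+;   %b.int = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+;   %b.vec = bitcast <4 x i32> %b.int to <4 x float>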