Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,11 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of a member of the interleaved Load/Store. + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +587,10 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +749,12 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Stride, Alignment, + AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,20 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) { + VectorType *VecType = dyn_cast(VecTy); + assert(VecType && "Expect a vector type"); + + // Cost for the sequential memory access for a vector + unsigned Cost = 1; + // Estimate the interleave cost to be extracting all lanes or inserting all + // lanes. + Cost += VecType->getNumElements(); + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -530,6 +530,7 @@ if (Lp != AR->getLoop()) { DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; } // The address calculation must not wrap. Otherwise, a dependence could be @@ -777,21 +778,29 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. 
- if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + unsigned NumIterations = ForcedFactor * ForcedUnroll; + // The number of vectorized & interleaved iterations should not be less than 2 + if (NumIterations < 2) + NumIterations = 2; + + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when either of: + // (1) Distance < Stride * TypeByteSize + // (2) Distance >= TypeByteSize * NumIterations * Stride + // is true and + // (3) Distance <= MaxSafeDepDistBytes + // is true. + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIterations * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + if (Distance >= TypeByteSize * NumIterations * Stride) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Stride, unsigned Alignment, + unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Stride, Alignment, + AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -134,6 +134,37 @@ "enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symblic stride memory access versioning")); +/// This enables vectorization on the interleaved memory accesses in code like +/// the following. +/// for (i = 0; i < N; i+=3) { +/// R = Pic[i]; // Load R color elements +/// G = Pic[i+1]; // Load G color elements +/// B = Pic[i+2]; // Load B color elements +/// ... // do something to R, G, B +/// } +/// Will be translated to a wide vector load and several shufflevectors +/// %wide.vec = load <12 x i32>, <12 x i32>* %ptr ; load for R,G,B +/// %R.vec = shufflevector %wide.vec, undef, <0, 3, 6, 9> ; mask for R +/// %G.vec = shufflevector %wide.vec, undef, <1, 4, 7, 10> ; mask for G +/// %B.vec = shufflevector %wide.vec, undef, <2, 5, 8, 11> ; mask for B +/// +/// for (i = 0; i < N; i+=3) { +/// ... 
do something to R, G, B +/// Pic[i] = R; // Store R color elements +/// Pic[i+1] = G; // Store G color elements +/// Pic[i+2] = B; // Store B color elements +/// } +/// Will be translated to several shufflevectors and a wide vector store +/// %RG.vec = shufflevector %R.vec, %G.vec, <0, 1, 2, ..., 7> +/// %BU.vec = shufflevector %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +/// %interleave = shufflevector %RG.vec, %BU.vec, +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; mask for interleaved store +/// store <12 x i32> %interleave, <12 x i32>* %ptr ; write for R,G,B +static cl::opt + VectorizeInterleavedAccess("vectorize-interleaved-access", cl::init(true), + cl::Hidden, + cl::desc("Vectorize interleaved memory access")); + /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; @@ -351,6 +382,8 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + void vectorizeInterleavedAccess(Instruction *InsPos, Value *Ptr, Type *VecTy); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -568,6 +601,15 @@ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} + ~LoopVectorizationLegality() { + SmallSet DelSet; + // Collecting all the pointers to avoid delete a pointer twice, + for (auto &I : InterleavedAccessMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + /// This enum represents the kinds of inductions that we support. enum InductionKind { IK_NoInduction, ///< Not an induction variable. @@ -640,6 +682,56 @@ ConstantInt *StepValue; }; + // An interleaved access list contains a group of interleaved accesses. + // E.g. for (i = 0; i < N; i+=2) { + // even = A[i]; + // odd = A[i + 1]; + // ... // operations on even/odd + // } + // The List consists of two accesses: even load, odd load. + struct InterleavedAccess { + InterleavedAccess(int Stride, bool IsLoad) + : Stride(Stride), IsLoad(IsLoad) {} + InterleavedAccess() : Stride(0) {} + + unsigned getIndex(Instruction *Access) { + assert(Members.count(Access) && + "The interleaved access doesn't contain such member"); + + for (unsigned i = 0; i < Members.size(); i++) + if (Members[i] == Access) + return i; + return Members.size(); + } + + unsigned getNumMembers() { return Members.size(); } + + bool isInsertPos(Instruction *Access) { return Access == InsertPos; } + + bool isReverse() { return Stride < 0; } + + int Stride; + bool IsLoad; + unsigned Align; + // The interleaved access list. + SmallSetVector Members; + + // To avoid breaking the dependence, the new vectorized interleaved access + // should be inserted at the position of either: + // (1) The first load access. + // or + // (2) The last store access. + // + // E.g. %even = load i32 // Insert Position (First load) + // %add = add i32 %even // Use of %even + // %odd = load i32 // Should not insert here + // + // store i32 %even // Should not insert here + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position (Last store) + Instruction *InsertPos; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. 
typedef DenseMap ReductionList; @@ -730,6 +822,16 @@ unsigned getNumPredStores() const { return NumPredStores; } + + bool belongsToInterleavedAccess(Instruction *I) { + return InterleavedAccessMap.count(I); + } + InterleavedAccess *getInterleavedAccess(Instruction *I) { + assert(belongsToInterleavedAccess(I) && + "The given access doesn't belong to any interleaved access"); + return InterleavedAccessMap[I]; + } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -762,7 +864,21 @@ /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop /// invariant. - void collectStridedAccess(Value *LoadOrStoreInst); + void collectInvarStridedAccess(Value *LoadOrStoreInst); + + /// Analyze interleaved accesses. + void analyzeInterleavedAccess(); + + /// Collect constant strided memory accesses. + /// + /// Looks for accesses like "a[i * C]" where "C" is a constant (except +/-1). + void + collectConstStridedAccess(MapVector &StridedAccesses); + + /// Find all interleaved accesses in a queue of loads/stores with the same + /// stride and base pointer. + void collectInterleavedAccess(SmallVector &Queue, + int Stride, bool IsLoad); /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. These are handled as LoopAccessReport rather than @@ -822,6 +938,10 @@ /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; + + /// A map containing the relationship between the members of an interleaved + /// access and the interleaved access itself. + DenseMap InterleavedAccessMap; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1657,6 +1777,211 @@ "reverse"); } +// Get the mask to interleave a wide vector concatenated from several small +// vectors like: V0, V1, V2, ... +// The mask is <0, VF, VF*2, ..., VF*NumVec-VF, 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors and VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedStoreMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from Start with step Stride: +// <Start, Start + Stride, Start + 2*Stride, ...> +// This can be used to deinterleave a wide vector into several small vectors. +// E.g. To deinterleave a wide vector of <8 x i32> type: +// %v0 = shufflevector <8 x i32> %wide.vec, undef, <0, 2, 4, 6> +// %v1 = shufflevector <8 x i32> %wide.vec, undef, <1, 3, 5, 7> +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: the first part consists of sequential constants +// starting from Idx, the second part consists of UNDEFs, +// i.e. <Idx, Idx+1, ..., Idx+NumSeq-1, undef, ..., undef>. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value.
+ if (NumUndef == 0) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 1st vector always +// has at least as many elements as the 2nd vector. +// If the 1st vector has more elements, extend the 2nd vector with UNDEFs. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2) { + VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType()); + VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 with undef elements. + Constant *ExtMask = + getSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *Undef = UndefValue::get(VecTy2); + Value *ExtV = Builder.CreateShuffleVector(V2, Undef, ExtMask); + + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV, Mask); +} + +// Concatenate a number of vectors in the given list, starting from Idx. +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &List, unsigned Idx, + unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return List[Idx]; + + unsigned PartNum = PowerOf2Floor(NumVec); + // When the number of vectors is not a power of 2, split the list and make sure + // the first list has a power-of-2 number of elements. + // E.g. To concatenate 7 vectors, split them into two lists with 4 and 3 elements. + if (PartNum == NumVec) + PartNum /= 2; + Value *V0 = ConcateVectors(Builder, List, Idx, PartNum); + Value *V1 = ConcateVectors(Builder, List, Idx + PartNum, NumVec - PartNum); + return ConcateTwoVectors(Builder, V0, V1); +} + +// Create the vectorized instruction for interleaved access. +// E.g.
Transform interleaved load with 2 members: +// %s1 = load i32, i32* %ptr1 +// %s2 = load i32, i32* %ptr2 +// To +// %wide.vec = load <8 x i32>, <8 x i32>* %ptr +// %v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6> +// %v2 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7> +// +// Transform interleaved store with 3 members: +// store i32 %s1, i32 %ptr1 +// store i32 %s2, i32 %ptr2 +// store i32 %s3, i32 %ptr3 +// To +// %v1_v2 = shufflevector <4 x i32> %v1, %v2, <0, 1, 2, 3, 4, 5, 6, 7> +// %v3_und = shufflevector <4 x i32> %v3, undef, <0, 1, 2, 3, 4, 5, 6, 7> +// %interleaved.vec = shufflevector %v1_v2, %v3_und, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +// store <12 x i32> %interleaved.vec, <12 x i32> %ptr +void InnerLoopVectorizer::vectorizeInterleavedAccess(Instruction *Instr, + Value *Ptr, Type *DataTy) { + VectorType *VecTy = dyn_cast(DataTy); + assert(VecTy && "expect a vector type"); + + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + assert(IA->InsertPos == Instr && "Must be the insert position"); + + unsigned Idx = IA->getIndex(Instr); + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + SmallSetVector &Members = IA->Members; + unsigned Align = IA->Align; + if (!Align) { + const DataLayout &DL = Instr->getModule()->getDataLayout(); + Align = DL.getABITypeAlignment(DataTy); + } + unsigned NumVec = IA->getNumMembers(); + Type *WideVecTy = VectorType::get(VecTy->getElementType(), NumVec * VF); + Type *WidePtrTy = WideVecTy->getPointerTo(AddressSpace); + + Constant *Zero = Builder.getInt32(0); + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + bool Reverse = IA->isReverse(); + for (unsigned Part = 0; Part < UF; Part++) { + Value *NewPtr = Builder.CreateExtractElement(PtrParts[Part], Zero); + // Adjust pointer to the access of index 0. + NewPtr = + Builder.CreateGEP(NewPtr, Builder.getInt32(-static_cast(Idx))); + + // If the address is reversed, then the wide load/store needs to start at + // the last access of index 0. + // E.g. For the reversed access of interleaved load for 2 members: + // {A[i], A[i+1]}, {A[i-2], A[i-1]}, {A[i-4], A[i-3]}, {A[i-6], A[i-5]} + // The NewPtr is pointed to A[i]. Need to adjust to A[i-6]. + if (Reverse) + NewPtr = Builder.CreateGEP( + NewPtr, Builder.getInt32(-static_cast(VF * NumVec - NumVec))); + + // Bit cast to the new wide vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, WidePtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + if (IA->IsLoad) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *WideLoad = + Builder.CreateAlignedLoad(NewPtrs[Part], Align, "wide.vec"); + + // Interleave and split the loaded wide vector into small vectors. + Value *Undef = UndefValue::get(WideVecTy); + for (unsigned i = 0; i < NumVec; i++) { + VectorParts &Entry = WidenMap.get(Members[i]); + Constant *StridedMask = getStridedMask(Builder, i, NumVec, VF); + Value *NewV = Builder.CreateShuffleVector(WideLoad, Undef, StridedMask, + "strided.vec"); + Entry[Part] = Reverse ? 
reverseVector(NewV) : NewV; + } + + propagateMetadata(WideLoad, Instr); + } + return; + } + + for (unsigned Part = 0; Part < UF; Part++) { + SmallVector StoredVecs; + for (unsigned i = 0; i < NumVec; i++) { + Value *StoredVec = getVectorValue(Members[i]->getOperand(0))[Part]; + if (Reverse) + StoredVec = reverseVector(StoredVec); + StoredVecs.push_back(StoredVec); + } + // Concatenate the apart vectors into a wide vector. + Value *WideVec = ConcateVectors(Builder, StoredVecs, 0, NumVec); + + Constant *IMask = getInterleavedStoreMask(Builder, VF, NumVec); + // Interleave the data in the wide vector. + Value *IVec = Builder.CreateShuffleVector( + WideVec, UndefValue::get(WideVecTy), IMask, "interleaved.vec"); + + Instruction *WideStore = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Align); + propagateMetadata(WideStore, Instr); + } + return; +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1684,6 +2009,15 @@ if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); + if (Legal->belongsToInterleavedAccess(Instr)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + // Skip the access if it is not the insert position. + if (!IA->isInsertPos(Instr)) + return; + return vectorizeInterleavedAccess(Instr, Ptr, DataTy); + } + // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -3414,6 +3748,344 @@ return true; } +static bool isInBoundsGep(Value *Ptr) { + if (GetElementPtrInst *GEP = dyn_cast(Ptr)) + return GEP->isInBounds(); + return false; +} + +/// \brief Check whether the access through \p Ptr has a constant stride. +static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const SCEV *&PtrScev) { + const Type *Ty = Ptr->getType(); + assert(Ty->isPointerTy() && "Unexpected non-ptr"); + const PointerType *PtrTy = cast(Ty); + + PtrScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(PtrScev); + // The pointer must be an AddRecExpr pointer and the accesss function must + // stride over the innermost loop. + if (!AR || Lp != AR->getLoop()) + return 0; + + // The address calculation must not wrap. Otherwise, a dependence could be + // inverted. + // An inbounds getelementptr that is a AddRec with a unit stride + // cannot wrap per definition. The unit stride requirement is checked later. + // An getelementptr without an inbounds attribute and unit stride would have + // to access the pointer value "0" which is undefined behavior in address + // space 0, therefore we can also vectorize this case. + bool IsInBoundsGEP = isInBoundsGep(Ptr); + bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); + bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; + if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) + return 0; + + // Check the step is constant. + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast(Step); + if (!C) + return 0; + + auto &DL = Lp->getHeader()->getModule()->getDataLayout(); + int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. + if (APStepVal.getBitWidth() > 64) + return 0; + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. 
+ int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) + return 0; + + // If the SCEV could wrap but we have an inbounds gep with a unit stride we + // know we can't "wrap around the address space". In case of address space + // zero we know that this won't happen without triggering undefined behavior. + if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && + Stride != 1 && Stride != -1) + return 0; + + return Stride; +} + +static unsigned getAddressSpaceOperand(Instruction *I) { + if (LoadInst *L = dyn_cast<LoadInst>(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast<StoreInst>(I)) + return S->getPointerAddressSpace(); + return -1; +} + +static Value *getPointerOperand(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->getPointerOperand(); + return nullptr; +} + +static Value *getBasePointer(Instruction *I, const DataLayout &DL) { + Value *Ptr = getPointerOperand(I); + assert(Ptr && "Failed to get a pointer from a memory instruction"); + return GetUnderlyingObject(Ptr, DL); +} + +static bool isConsecutiveAccess(Instruction *A, Instruction *B, + ScalarEvolution *SE, const DataLayout &DL) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); + Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); + + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + + APInt OffsetDelta = OffsetB - OffsetA; + + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; + + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; + + // Otherwise compute the distance with SCEV between the base pointers.
+ const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(BaseDelta); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} + +void LoopVectorizationLegality::collectConstStridedAccess( + MapVector &StridedAccesses) { + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Value *Ptr = getPointerOperand(it); + if (!Ptr) + continue; + + const SCEV *PtrScev; + int Stride = isStridedPtr(SE, Ptr, TheLoop, PtrScev); + if (Stride < 2 && Stride > -2) + continue; + + DEBUG(dbgs() << "LV: Found a constant strided access\n"); + DEBUG(dbgs() << " pointer:" << *Ptr << "\n"); + DEBUG(dbgs() << " Stride:" << Stride << "\n"); + StridedAccesses[it] = Stride; + } + } +} + +void LoopVectorizationLegality::collectInterleavedAccess( + SmallVector &Queue, int Stride, bool IsLoad) { + if (Queue.size() < 2) { + Queue.clear(); + return; + } + + SetVector Heads; + SetVector Tails; + SmallDenseMap ConsecutiveChain; + + // Collect all the consecutive pairs. + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (unsigned i = 0, e = Queue.size(); i != e; i++) { + for (unsigned j = 0; j != e; j++) { + if (i == j) + continue; + if (isConsecutiveAccess(Queue[i], Queue[j], SE, DL)) { + Heads.insert(Queue[i]); + Tails.insert(Queue[j]); + ConsecutiveChain[Queue[i]] = Queue[j]; + } + } + } + + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + SmallSetVector ConsecutiveList; + Instruction *I = *it; + + // Collect a list with consecutive accesses. + while (Tails.count(I) || Heads.count(I)) { + ConsecutiveList.insert(I); + // Move to the next access in the chain. + I = ConsecutiveChain[I]; + } + + unsigned Size = ConsecutiveList.size(); + unsigned Num = std::abs(Stride); + if (Size < Num) + continue; + + // As every interleaved access consists of as many members as the stride, + // there may be more than 1 interleaved access in this list. + for (unsigned Part = 0; Part < Size / Num; Part++) { + InterleavedAccess *IA = new InterleavedAccess(Stride, IsLoad); + SmallSetVector &Members = IA->Members; + unsigned MaxAlign = 0; + for (unsigned i = Part * Num; i < Part * Num + Num; i++) { + Instruction *Access = ConsecutiveList[i]; + // Insert each member. + Members.insert(Access); + // Record the relationship between the member and the interleaved access. + InterleavedAccessMap[Access] = IA; + + unsigned Align = IsLoad ? dyn_cast<LoadInst>(Access)->getAlignment() + : dyn_cast<StoreInst>(Access)->getAlignment(); + if (Align > MaxAlign) + MaxAlign = Align; + } + // The alignment of the interleaved access is the max member alignment. + IA->Align = MaxAlign; + + // To avoid breaking the dependences, the new vectorized interleaved + // access has different insert positions for load and store. + if (IsLoad) { + for (unsigned i = 0, e = Queue.size(); i != e; i++) { + // Start from the first access in the queue. If an access is in the + // current interleaved access, it is the first access, so set it to be + // the insert position. + if (Members.count(Queue[i])) { + IA->InsertPos = Queue[i]; + break; + } + } + } else { + for (int i = Queue.size() - 1, e = -1; i != e; i--) { + // Start from the last access in the queue.
If an access is in the + // current interleaved access, it is the last access, so set it to be + // the insert position. + if (Members.count(Queue[i])) { + IA->InsertPos = Queue[i]; + break; + } + } + } + + DEBUG(dbgs() << "LV: Found an interleaved access with Stride " << Stride + << "\n"); + } + } + + return; +} + +void LoopVectorizationLegality::analyzeInterleavedAccess() { + MapVector StridedAccesses; + collectConstStridedAccess(StridedAccesses); + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + + // The following logic groups the constant strided accesses and finds the + // interleaved accesses in two steps: + // (1) Groups loads/stores with the same stride and base pointers into a + // queue. + // (2) Calls another function to collect all interleaved accesses in that + // queue. + // TODO: It can't collect interleaved accesses if the loads/stores are mixed. + // E.g. for (i = 0; i < N; i+=2) { + // A[i] += 1; + // A[i+1] += 2; + // } + // It will fail to identify the interleaved accesses due to the mixed order: + // load A[i], store A[i], load A[i+1], store A[i+1]. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + // Queue for the loads/stores of the same stride and base pointer. + SmallVector AccessQueue; + bool TryMerge = false; + Value *CurBase = nullptr; + int CurStride; + bool IsLoad = false; + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + if (TryMerge) { + collectInterleavedAccess(AccessQueue, CurStride, IsLoad); + CurBase = nullptr; + TryMerge = false; + } + + if (!it->mayReadOrWriteMemory()) + continue; + + // A non-strided memory access may break the dependence. Merge the queue. + if (!StridedAccesses.count(it)) { + TryMerge = true; + continue; + } + + Value *Base = getBasePointer(it, DL); + int Stride = StridedAccesses[it]; + bool Read = it->mayReadFromMemory(); + + // Start a new queue. + if (CurBase == nullptr) { + AccessQueue.clear(); + AccessQueue.push_back(it); + CurBase = Base; + CurStride = Stride; + IsLoad = Read; + continue; + } + + // This instruction doesn't fit the current queue. Try to merge the current + // queue and insert it again. + // TODO: this may miss opportunities as it hasn't collected all + // loads/stores with the same base pointer and stride. + if (CurBase != Base || CurStride != Stride || IsLoad != Read) { + TryMerge = true; + --it; + continue; + } + + AccessQueue.push_back(it); + } + + // Collect the tail accesses. + if (CurBase != nullptr) { + collectInterleavedAccess(AccessQueue, CurStride, IsLoad); + CurBase = nullptr; + TryMerge = false; + } + } +} + static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -3601,12 +4273,12 @@ return false; } if (EnableMemAccessVersioning) - collectStridedAccess(ST); + collectInvarStridedAccess(ST); } if (EnableMemAccessVersioning) if (LoadInst *LI = dyn_cast<LoadInst>(it)) - collectStridedAccess(LI); + collectInvarStridedAccess(LI); // Reduction instructions are allowed to have exit users. // All other instructions must not have external users.
@@ -3629,6 +4301,9 @@ } } + if (VectorizeInterleavedAccess) + analyzeInterleavedAccess(); + return true; } @@ -3744,7 +4419,7 @@ return Stride; } -void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { +void LoopVectorizationLegality::collectInvarStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) Ptr = LI->getPointerOperand(); @@ -3757,7 +4432,7 @@ if (!Stride) return; - DEBUG(dbgs() << "LV: Found a strided access that we can version"); + DEBUG(dbgs() << "LV: Found an invariable strided access that we can version"); DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); Strides[Ptr] = Stride; StrideSet.insert(Stride); @@ -4575,6 +5250,18 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved memory loads/stores + if (Legal->belongsToInterleavedAccess(I)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(I); + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), VectorTy, IA->getNumMembers(), IA->Align, AS); + if (IA->isReverse()) + Cost += + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/AArch64/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/interleaved-access.ll @@ -0,0 +1,497 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -runtime-memory-check-threshold=24 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 
-runtime-memory-check-threshold=24 < %s | FileCheck %s --check-prefix=UF2 + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; int test_struct_load2(struct ST2 *A) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. 
+; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +; UF2-LABEL: @test_array_load2_store2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. 
+; int A[3072]; +; struct ST S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; T[i].x = X1 + 1; +; T[i].y = X2 + 2; +; T[i].z = X3 + 3; +; } +; } + +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> +; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4 + +; UF2-LABEL: @test_struct_array_load3_store3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <12 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <12 x i32> + +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +; UF2-LABEL: @test_struct_load4( +; UF2: load <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: load <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +; UF2-LABEL: @test_struct_store4( +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <16 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <16 x i32> + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x 
i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; This case checks that reversed interleaved loads can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; int test_reversed_load2(struct ST2 *A) { +; int add = 0; +; int sub = 0; +; for (int i = 1023; i >= 0; i--) { +; add += A[i].x; +; sub -= A[i].y; +; } +; return add*sub; +; } + +; CHECK-LABEL: @test_reversed_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> + +define i32 @test_reversed_load2(%struct.ST2* nocapture readonly %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %mul = mul nsw i32 %sub4, %add1 + ret i32 %mul + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %sub.015 = phi i32 [ 0, %entry ], [ %sub4, %for.body ] + %add.014 = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add1 = add nsw i32 %0, %add.014 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub4 = sub nsw i32 %sub.015, %1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; This case checks that reversed interleaved loads/stores can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: 
shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +}
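
The shuffle masks built by getInterleavedStoreMask and getStridedMask in the LoopVectorize.cpp hunk above carry the core index math of this patch. As a quick reference, here is a small standalone C++ sketch, not part of the patch and with no LLVM dependency (the helper names interleavedStoreMask, stridedMask and printMask are illustrative only), that mirrors those loops and prints the resulting masks for the VF = 4, two-member case used in the comments and tests:

#include <cstdio>
#include <vector>

// Mirrors getInterleavedStoreMask: interleave NumVec concatenated vectors of
// VF elements each, so that lane i of every member precedes lane i+1.
static std::vector<unsigned> interleavedStoreMask(unsigned VF, unsigned NumVec) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVec; j++)
      Mask.push_back(j * VF + i);
  return Mask;
}

// Mirrors getStridedMask: extract the member starting at Start with step
// Stride from a wide interleaved vector (the de-interleaving shuffle).
static std::vector<unsigned> stridedMask(unsigned Start, unsigned Stride,
                                         unsigned VF) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < VF; i++)
    Mask.push_back(Start + i * Stride);
  return Mask;
}

static void printMask(const char *Name, const std::vector<unsigned> &Mask) {
  std::printf("%s: <", Name);
  for (unsigned i = 0; i < Mask.size(); i++)
    std::printf("%s%u", i ? ", " : "", Mask[i]);
  std::printf(">\n");
}

int main() {
  // Two members, VF = 4: store mask is <0, 4, 1, 5, 2, 6, 3, 7>.
  printMask("interleaved store mask (VF=4, NumVec=2)", interleavedStoreMask(4, 2));
  // Member 0 and member 1 of a stride-2 wide load: <0, 2, 4, 6> and <1, 3, 5, 7>.
  printMask("strided mask (Start=0, Stride=2, VF=4)", stridedMask(0, 2, 4));
  printMask("strided mask (Start=1, Stride=2, VF=4)", stridedMask(1, 2, 4));
  return 0;
}

For a fixed VF and member count, the store (interleave) mask and the concatenation of the per-member strided (de-interleave) masks are inverse permutations of each other, which is why an interleaved load followed by an interleaved store of the same group leaves the memory layout unchanged.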