Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -499,6 +499,11 @@ const ValueToValueMap &PtrToStride, Value *Ptr, Value *OrigPtr = nullptr); +/// \brief Check the stride of the pointer and ensure that it does not wrap in +/// the address space. +int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const ValueToValueMap &StridesMap); + /// \brief This analysis provides dependence information for the memory accesses /// of a loop. /// Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,14 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of one member in the interleaved access. + /// \p Vecty is the wide vecotor type of the whole interleaved access. + /// \p SubTy is the vector type of the member. + /// \p Index is the index of current member. + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +590,10 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + Type *SubTy, unsigned Index, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +752,12 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, SubTy, Index, + Alignment, AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,12 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -522,6 +522,50 @@ return Cost; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Alignment, + unsigned AddressSpace) { + VectorType *VT = dyn_cast(VecTy); + VectorType *SubVT = dyn_cast(SubTy); + assert(VT && SubVT && "Expect vector types"); + + unsigned NumElts = VT->getNumElements(); + unsigned SubNumElts = 
SubVT->getNumElements(); + unsigned Stride = NumElts / SubNumElts; + assert(NumElts % SubNumElts == 0 && Stride > 1 && "Invalide vector types"); + assert(Index < Stride && "Invalid Index"); + + // The load/store cost is one part of the wide vector memory op cost. + unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + Cost = Cost / Stride; + + // Then need to calculate the cost for interleave operations. + if (Opcode == Instruction::Load) { + // E.g. Interleaved load for the vector %V1 of index 1: + // %V = load <4 x i32>, <4 x i32>* %ptr + // %V1 = shufflevector <4 x i32> %V, <4 x i32> undef, <1, 3> + // The cost is extract element 1 and element 3 from the wide vector, and + // insert them sequentially into the sub vector. + for (unsigned i = 0; i < SubNumElts; i++) { + Cost += getVectorInstrCost(Instruction::ExtractElement, VecTy, + Index + i * Stride); + Cost += getVectorInstrCost(Instruction::InsertElement, SubTy, i); + } + } else { + // E.g. Interleaved store for the vector %V1 of index 1: + // %IV = shufflevector <2 x i32> %V0, <2 x i32> %V1, <0, 2, 1, 3> + // store <4 x i32> %IV, <4 x i32> %ptr + // The cost is extract element 0 and element 1 from %V1, and insert them + // into lane 1 and lane 3 of the wide vector. + for (unsigned i = 0; i < SubNumElts; i++) { + Cost += getVectorInstrCost(Instruction::ExtractElement, SubTy, i); + Cost += getVectorInstrCost(Instruction::InsertElement, VecTy, + Index + i * Stride); + } + } + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; Index: include/llvm/Transforms/Utils/VectorUtils.h =================================================================== --- include/llvm/Transforms/Utils/VectorUtils.h +++ include/llvm/Transforms/Utils/VectorUtils.h @@ -17,6 +17,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/Analysis/ScalarEvolution.h" namespace llvm { @@ -200,6 +201,11 @@ return Intrinsic::not_intrinsic; } +Value *getPointerOperand(Value *I); + +bool isConsecutiveAccess(Value *A, Value *B, ScalarEvolution *SE, + const DataLayout &DL); + } // llvm namespace #endif Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -284,11 +284,6 @@ return AR->isAffine(); } -/// \brief Check the stride of the pointer and ensure that it does not wrap in -/// the address space. -static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap); - bool AccessAnalysis::canCheckPtrAtRT( LoopAccessInfo::RuntimePointerCheck &RtCheck, unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &StridesMap, @@ -504,8 +499,8 @@ } /// \brief Check whether the access through \p Ptr has a constant stride. -static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap) { +int llvm::isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const ValueToValueMap &StridesMap) { const Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); @@ -770,28 +765,39 @@ return Dependence::Unknown; } - unsigned Distance = (unsigned) Val.getZExtValue(); - - // Bail out early if passed-in parameters make vectorization not feasible. - unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? 
- VectorizerParams::VectorizationFactor : 1); - unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? - VectorizerParams::VectorizationInterleave : 1); + unsigned ForcedFactor = VectorizerParams::VectorizationFactor; + unsigned ForcedUnroll = VectorizerParams::VectorizationInterleave; + // The number of iterations to be vectorized and unrolled. + // If one passed-in parameter is not forced, notice that the number is 0. Then + // we need to adjust it. + unsigned NumIter = ForcedFactor * ForcedUnroll; + if (!NumIter) { + NumIter = ForcedFactor ? ForcedFactor : ForcedUnroll; + // It could be 0 (if both parameters are not forced) or 1 (if the + // forced parameter is 1). Then we set it to be the minimum number: 2. + if (NumIter < 2) + NumIter = 2; + } - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + unsigned Distance = (unsigned)Val.getZExtValue(); + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when + // Distance <= MaxSafeDepDistBytes + // And either one below is true + // Distance < Stride * TypeByteSize + // Distance >= TypeByteSize * NumIter * Stride + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIter * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } - // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + // If Distance < Stride * TypeByteSize, it is always safe. just keep the + // current MaxSafeDepDistBytes. Otherwise, update the MaxSafeDepDistBytes. + if (Distance >= Stride * TypeByteSize) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, Type *SubTy, unsigned Index, + unsigned Alignment, unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, SubTy, Index, + Alignment, AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Utils/CMakeLists.txt =================================================================== --- lib/Transforms/Utils/CMakeLists.txt +++ lib/Transforms/Utils/CMakeLists.txt @@ -37,6 +37,7 @@ UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp + VectorUtils.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms Index: lib/Transforms/Utils/VectorUtils.cpp =================================================================== --- /dev/null +++ lib/Transforms/Utils/VectorUtils.cpp @@ -0,0 +1,76 @@ +//===--------------- VectorUtils.cpp - Vectorizer utilities ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines some vectorizer utilities. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/VectorUtils.h" + +namespace llvm { + +Value *getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return nullptr; +} + +static unsigned getAddressSpaceOperand(Value *I) { + if (LoadInst *L = dyn_cast(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast(I)) + return S->getPointerAddressSpace(); + return -1; +} + +bool isConsecutiveAccess(Value *A, Value *B, ScalarEvolution *SE, + const DataLayout &DL) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); + Type *Ty = cast(PtrA->getType())->getElementType(); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); + + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + + APInt OffsetDelta = OffsetB - OffsetA; + + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; + + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; + + // Otherwise compute the distance with SCEV between the base pointers. + const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(BaseDelta); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} + +} // End llvm namespace Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -134,6 +134,11 @@ "enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symblic stride memory access versioning")); +/// This enables vectorization on the interleaved memory accesses +static cl::opt VectorizeInterleavedAccess( + "vectorize-interleaved-access", cl::init(false), cl::Hidden, + cl::desc("Vectorize interleaved memory access in a loop")); + /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; @@ -351,6 +356,11 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// Vectorize the interleaved memory access. \p Instr is the member in insert + /// position. + void vectorizeInterleavedAccess(Instruction *Instr, Value *Ptr, + VectorType *VecTy); + /// Generate a shuffle sequence that will reverse the vector Vec. 
virtual Value *reverseVector(Value *Vec); @@ -568,6 +578,15 @@ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} + ~LoopVectorizationLegality() { + SmallSet DelSet; + // Collecting all the pointers to avoid delete a pointer twice. + for (auto &I : InterleavedAccessMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + /// This enum represents the kinds of inductions that we support. enum InductionKind { IK_NoInduction, ///< Not an induction variable. @@ -640,6 +659,44 @@ ConstantInt *StepValue; }; + // An interleaved access consists of a group of interleaved member accesses. + // E.g. for (i = 0; i < N; i+=2) { + // even = A[i]; // Member of index 0 + // odd = A[i + 1]; // Member of index 1 + // ... + // } + struct InterleavedAccess { + InterleavedAccess(int Stride, bool Reverse, bool IsLoad) + : Stride(Stride), Reverse(Reverse), IsLoad(IsLoad) {} + InterleavedAccess() : Stride(0) {} + + unsigned getIndex(Instruction *Access) { + assert(Members.count(Access) && + "The interleaved access doesn't contain such member"); + + for (unsigned i = 0; i < Members.size(); i++) + if (Members[i] == Access) + return i; + llvm_unreachable("Contains no such member"); + } + + SmallSetVector Members; + unsigned Stride; + bool Reverse; + bool IsLoad; + unsigned Align; + // To avoid breaking the dependence, the new vectorized interleaved access + // should be inserted at either the first load or the last store. + // E.g. %even = load i32 // Insert Position + // %add = add i32 %even // Use of %even + // %odd = load i32 // + // + // store i32 %even // + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position + Instruction *InsertPos; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. typedef DenseMap ReductionList; @@ -730,6 +787,16 @@ unsigned getNumPredStores() const { return NumPredStores; } + + bool belongsToInterleavedAccess(Instruction *I) { + return InterleavedAccessMap.count(I); + } + InterleavedAccess *getInterleavedAccess(Instruction *I) { + assert(belongsToInterleavedAccess(I) && + "Not a member of an interleaved access"); + return InterleavedAccessMap[I]; + } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -762,7 +829,21 @@ /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop /// invariant. - void collectStridedAccess(Value *LoadOrStoreInst); + void collectInvarStridedAccess(Value *LoadOrStoreInst); + + /// Analyze interleaved accesses. + void analyzeInterleavedAccess(); + + /// Collect constant strided memory access. + /// + /// Looks for accesses like "a[i * C]" where "C" is a constant (Except +/-1) + void + collectConstStridedAccess(MapVector &StridedAccesses); + + /// Find all interleaved accesses in \p Queue which consist of loads/stores + /// with the same stride \p Stride and base pointer. + void collectInterleavedAccess(SmallVector &Queue, + int Stride, bool IsLoad); /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. These are handled as LoopAccessReport rather than @@ -822,6 +903,9 @@ /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; + + /// Contains the relationships between the members and the interleaved access. 
+ DenseMap InterleavedAccessMap; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1657,6 +1741,225 @@ "reverse"); } +// Get the mask to interleave a wide vector concatenated from several small +// vectors like: V0, V1, V2, ... +// The mask is: +// +// Which is equal to +// <0, VF, VF*2, ..., VF*NumVec-VF, 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors, if VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedStoreMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index Start: +// +// This can be used to interleave a wide vector into several small vectors. +// E.g. To interleave a wide vector of <8 x i32> type into two vectors: +// %v0 = shufflevector <8 x i32> %wide.vec, undef, <0, 2, 4, 6> +// %v1 = shufflevector <8 x i32> %wide.vec, undef, <1, 3, 5, 7> +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consist of sequential constants, +// The second part consist of UNDEFs. +// I.E. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value. + if (NumUndef == 0) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 1st vector always +// has the same number of elements as the 2nd vector or more elements. +// If the 1st vector has more elements, extend the 2nd vector with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, + Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 with undef elements. 
+ Constant *ExtMask = + getSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *Undef = UndefValue::get(VecTy2); + Value *ExtV = Builder.CreateShuffleVector(V2, Undef, ExtMask); + + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV, Mask); +} + +// Concatenate number of vectors in the give list starting from Idx +static Value *ConcatenateVectors(IRBuilder<> &Builder, + SmallVector &List, + unsigned StartIdx, unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return List[StartIdx]; + + unsigned PartNum = PowerOf2Floor(NumVec); + // When the number of vectors is not power of 2, split the list and make sure + // the first list has power of 2 elements. + // E.g. To concate 7 vectors, split into two lists with 4 and 3 elements. + if (PartNum == NumVec) + PartNum /= 2; + Value *V0 = ConcatenateVectors(Builder, List, StartIdx, PartNum); + Value *V1 = + ConcatenateVectors(Builder, List, StartIdx + PartNum, NumVec - PartNum); + return ConcatenateTwoVectors(Builder, V0, V1); +} + +// E.g. Translate following interleaved loads (VF is 4): +// for (i = 0; i < N; i+=3) { +// R = Pic[i]; // interleaved load of index 0 +// G = Pic[i+1]; // interleaved load of index 1 +// B = Pic[i+2]; // interleaved load of index 2 +// ... // do something to R, G, B +// } +// +// To: +// %wide.vec = load <12 x i32>, <12 x i32>* %ptr ; read R,G,B +// %R.vec = shufflevector %wide.vec, undef, <0, 3, 6, 9> +// %G.vec = shufflevector %wide.vec, undef, <1, 4, 7, 10> +// %B.vec = shufflevector %wide.vec, undef, <2, 5, 8, 11> +// +// Or translate following interleaved stores (VF is 4): +// for (i = 0; i < N; i+=3) { +// ... do something to R, G, B +// Pic[i] = R; // interleaved store of index 0 +// Pic[i+1] = G; // interleaved store of index 1 +// Pic[i+2] = B; // interleaved store of index 2 +// } +// +// To: +// %R_G.vec = shufflevector %R.vec, %G.vec, <0, 1, 2, ..., 7> +// %B_U.vec = shufflevector %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %interleaved.vec = shufflevector %R_G.vec, %B_U.vec, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +// store <12 x i32> %interleaved.vec, <12 x i32>* %ptr ; write R,G,B +void InnerLoopVectorizer::vectorizeInterleavedAccess(Instruction *Instr, + Value *Ptr, + VectorType *VecTy) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + assert(IA && IA->InsertPos == Instr && "Unexpected instruction"); + unsigned Idx = IA->getIndex(Instr); + SmallSetVector &Members = IA->Members; + unsigned NumVec = IA->Stride; + bool Reverse = IA->Reverse; + unsigned Align = IA->Align; + // Set the default alignment if there is no alignment. + if (!Align) { + const DataLayout &DL = Instr->getModule()->getDataLayout(); + Align = DL.getABITypeAlignment(VecTy); + } + + // Create the wide vector types for the wide load/store. + Type *WideVecTy = VectorType::get(VecTy->getElementType(), NumVec * VF); + Type *WidePtrTy = + WideVecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + + VectorParts &PtrParts = getVectorValue(Ptr); + Constant *Zero = Builder.getInt32(0); + SmallVector NewPtrs; + setDebugLocFromInst(Builder, Ptr); + // Adjust the new pointers. + for (unsigned Part = 0; Part < UF; Part++) { + Value *NewPtr = Builder.CreateExtractElement(PtrParts[Part], Zero); + // Adjust the new pointer to point to the access of index 0. 
+ NewPtr = + Builder.CreateGEP(NewPtr, Builder.getInt32(-static_cast(Idx))); + + // If the address is reversed, then the wide load/store needs to start at + // the last access of index 0. + // E.g. For the reversed access of interleaved load with 2 members: + // {A[i], A[i+1]}, {A[i-2], A[i-1]}, {A[i-4], A[i-3]}, {A[i-6], A[i-5]} + // The new pointer is now pointed to A[i]. Adjust it to A[i-6]. + if (Reverse) + NewPtr = Builder.CreateGEP( + NewPtr, Builder.getInt32(-static_cast(VF * NumVec - NumVec))); + + // Bit cast to the new wide vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, WidePtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + // Vectorize interleaved loads. + if (IA->IsLoad) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *WideLoad = + Builder.CreateAlignedLoad(NewPtrs[Part], Align, "wide.vec"); + + // Interleave and split the loaded wide vector into small vectors. + Value *Undef = UndefValue::get(WideVecTy); + for (unsigned i = 0; i < NumVec; i++) { + Constant *StridedMask = getStridedMask(Builder, i, NumVec, VF); + Value *NewV = Builder.CreateShuffleVector(WideLoad, Undef, StridedMask, + "strided.vec"); + VectorParts &Entry = WidenMap.get(Members[i]); + Entry[Part] = Reverse ? reverseVector(NewV) : NewV; + } + + propagateMetadata(WideLoad, Instr); + } + return; + } + + // Vectorize interleaved stores. + for (unsigned Part = 0; Part < UF; Part++) { + SmallVector StoredVecs; + for (auto &I : Members) { + Value *StoredVec = getVectorValue(I->getOperand(0))[Part]; + if (Reverse) + StoredVec = reverseVector(StoredVec); + StoredVecs.push_back(StoredVec); + } + // Concatenate the apart vectors into a wide vector. + Value *WideVec = ConcatenateVectors(Builder, StoredVecs, 0, NumVec); + + Constant *IMask = getInterleavedStoreMask(Builder, VF, NumVec); + // Interleave the data in the wide vector. + Value *IVec = Builder.CreateShuffleVector( + WideVec, UndefValue::get(WideVecTy), IMask, "interleaved.vec"); + + Instruction *WideStore = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Align); + propagateMetadata(WideStore, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1684,6 +1987,15 @@ if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); + if (Legal->belongsToInterleavedAccess(Instr)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + // Skip the access if it is not the insert position. + if (IA->InsertPos != Instr) + return; + return vectorizeInterleavedAccess(Instr, Ptr, dyn_cast(DataTy)); + } + // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. 
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -3414,6 +3726,213 @@ return true; } +void LoopVectorizationLegality::collectConstStridedAccess( + MapVector &StridedAccesses) { + for (auto BB = TheLoop->block_begin(), BE = TheLoop->block_end(); BB != BE; + ++BB) { + if (blockNeedsPredication(*BB)) + continue; + + for (auto I = (*BB)->begin(), E = (*BB)->end(); I != E; I++) { + Value *Ptr = getPointerOperand(I); + if (!Ptr) + continue; + + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + if (Stride < 2 && Stride > -2) + continue; + + DEBUG(dbgs() << "LV: Found a constant strided access\n"); + DEBUG(dbgs() << " pointer:" << *Ptr << "\n"); + DEBUG(dbgs() << " Stride:" << Stride << "\n"); + StridedAccesses[I] = Stride; + } + } +} + +void LoopVectorizationLegality::collectInterleavedAccess( + SmallVector &Queue, int Stride, bool IsLoad) { + if (Queue.size() < 2) + return; + + SetVector Heads; + SetVector Tails; + SmallDenseMap ConsecutivePairs; + + // Collect all the consecutive pairs. + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (auto &H : Queue) { + for (auto &T : Queue) { + if (H == T || !isConsecutiveAccess(H, T, SE, DL)) + continue; + // If there is already Head or Tail, we have redundant accesses. This may + // result in incorrect results. Just return directly. + // E.g. Three writes on: + // A[i] (1) + // A[i] (2) + // A[i+1] (3) + // If we combine (1) and (3) together and generate code after (2): + // A[i] (2) + // A[i]+A[i+1] (1)(2) + // Then we get incorrect result. + if (Heads.count(H) || Tails.count(T)) + return; + + Heads.insert(H); + Tails.insert(T); + ConsecutivePairs[H] = T; + } + } + + for (auto &I : Heads) { + if (Tails.count(I)) + continue; + + SmallSetVector ConsecutiveChain; + // Collect a chain with consecutive accesses. + auto Inst = I; + while (Tails.count(Inst) || Heads.count(Inst)) { + ConsecutiveChain.insert(Inst); + // Move to the next access in the chain. + Inst = ConsecutivePairs[Inst]; + } + + unsigned ChainLen = ConsecutiveChain.size(); + // The length of the interleaved access list is the absolute value of stride + unsigned IALen = std::abs(Stride); + if (ChainLen < IALen) + continue; + + // There may be more than 1 interleaved access list in this chain. + for (unsigned Part = 0; Part < ChainLen / IALen; Part++) { + // If the Stride is negative, it is reversed. + bool Reverse = Stride < 0; + InterleavedAccess *IA = new InterleavedAccess(IALen, Reverse, IsLoad); + DEBUG(dbgs() << "LV: Found an interleaved accesses with Stride " + << IALen << "\n"); + + SmallSetVector &Members = IA->Members; + unsigned MinAlign = 0; + for (unsigned i = 0; i < IALen; i++) { + Instruction *Access = ConsecutiveChain[Part * IALen + i]; + Members.insert(Access); + // Record the relationship of the member and the interleaved access + InterleavedAccessMap[Access] = IA; + + unsigned Align = IsLoad ? dyn_cast(Access)->getAlignment() + : dyn_cast(Access)->getAlignment(); + if (!MinAlign) + MinAlign = Align; + else if (Align && Align < MinAlign) + MinAlign = Align; + } + // The alignment of the interleaved access is the min alignment. + IA->Align = MinAlign; + + // To avoid breaking the dependences, the new vectorized interleaved + // access has different insert positions for load and store. + if (IsLoad) { + for (auto &II : Queue) { + // Find the first load and set it to be the insert position. 
+ if (Members.count(II)) { + IA->InsertPos = II; + break; + } + } + } else { + for (auto II = Queue.rbegin(), E = Queue.rend(); II != E; ++II) { + // Find the last store and set it to be the insert position. + if (Members.count(*II)) { + IA->InsertPos = *II; + break; + } + } + } + } // end of handling consecutive list. + } // end of handling Heads +} + +static Value *getBasePointer(Instruction *I, const DataLayout &DL) { + Value *Ptr = getPointerOperand(I); + assert(Ptr && "Fail to get point from a memory instruction"); + return GetUnderlyingObject(Ptr, DL); +} + +void LoopVectorizationLegality::analyzeInterleavedAccess() { + MapVector StridedAccesses; + collectConstStridedAccess(StridedAccesses); + + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + // The following code groups the constant strided accesses and find the + // interleaved accesses in two steps: + // (1) Groups loads/stores with the same stride and base pointer into a + // queue. + // (2) Calls another function to collect all interleaved accesses in that + // queue. + // TODO: It can't collect interleaved access if the loads/stores are mixed. + // E.g. for (i = 0; i < N; i+=2) { + // A[i] += 1; + // A[i+1] += 2; + // } + // It could fail to identify the interleaved accesses as the mixed order: + // load A[i], store A[i], load A[i+1], store A[i+1]. + for (auto BB = TheLoop->block_begin(), BE = TheLoop->block_end(); BB != BE; + ++BB) { + if (blockNeedsPredication(*BB)) + continue; + + // Queue for the loads/stores of the same stride and base pointer. + SmallVector Queue; + Value *CurBase = nullptr; + unsigned CurStride = 0; + bool CurLoad = false; + for (auto I = (*BB)->begin(), E = (*BB)->end(); I != E; ++I) { + if (!I->mayReadOrWriteMemory()) + continue; + + Value *Base = nullptr; + unsigned Stride = 0; + bool IsLoad = false; + if (StridedAccesses.count(I)) { + Base = getBasePointer(I, DL); + Stride = StridedAccesses[I]; + IsLoad = I->mayReadFromMemory(); + // If there is no current queue, start a new queue with this access. + if (!CurBase) { + Queue.push_back(I); + CurBase = Base; + CurStride = Stride; + CurLoad = IsLoad; + continue; + } + + // Add to the current queue if this access matches the stride, base, + // and memory operation type. + if (CurBase == Base && CurStride == Stride && CurLoad == IsLoad) { + Queue.push_back(I); + continue; + } + } + + // If this access can not be added into the queue, it may break the + // dependence. So try to analyze and clear current queue. + collectInterleavedAccess(Queue, CurStride, CurLoad); + Queue.clear(); + CurBase = Base; + // Continue if this access is not strided access. + if (!CurBase) + continue; + + Queue.push_back(I); + CurStride = Stride; + CurLoad = IsLoad; + } // End of handling BB + + // Collect the tailed accesses. + collectInterleavedAccess(Queue, CurStride, CurLoad); + } // End of handling TheLoop +} + static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -3601,12 +4120,12 @@ return false; } if (EnableMemAccessVersioning) - collectStridedAccess(ST); + collectInvarStridedAccess(ST); } if (EnableMemAccessVersioning) if (LoadInst *LI = dyn_cast(it)) - collectStridedAccess(LI); + collectInvarStridedAccess(LI); // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. 
@@ -3629,6 +4148,9 @@ } } + if (VectorizeInterleavedAccess) + analyzeInterleavedAccess(); + return true; } @@ -3744,7 +4266,7 @@ return Stride; } -void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { +void LoopVectorizationLegality::collectInvarStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) Ptr = LI->getPointerOperand(); @@ -3757,7 +4279,7 @@ if (!Stride) return; - DEBUG(dbgs() << "LV: Found a strided access that we can version"); + DEBUG(dbgs() << "LV: Found an invariable strided access that we can version"); DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); Strides[Ptr] = Stride; StrideSet.insert(Stride); @@ -4575,6 +5097,21 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved memory loads/stores + if (Legal->belongsToInterleavedAccess(I)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(I); + Type *WideVecTy = + VectorType::get(VectorTy->getVectorElementType(), + VectorTy->getVectorNumElements() * IA->Stride); + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, VectorTy, IA->getIndex(I), IA->Align, AS); + if (IA->Reverse) + Cost += + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -382,9 +382,6 @@ } } - /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL); - /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -412,14 +409,6 @@ /// vectorized, or NULL. They may happen in cycles. Value *alreadyVectorized(ArrayRef VL) const; - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); - - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); - /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(Type *Ty); @@ -1130,8 +1119,8 @@ return; } const DataLayout &DL = F->getParent()->getDataLayout(); - if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { - if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], SE, DL)) { + if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], SE, DL)) { ++NumLoadsWantToChangeOrder; } BS.cancelScheduling(VL); @@ -1303,7 +1292,7 @@ const DataLayout &DL = F->getParent()->getDataLayout(); // Check if the stores are consecutive or of we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], SE, DL)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); @@ -1774,63 +1763,6 @@ return getGatherCost(VecTy); } -Value *BoUpSLP::getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast(I)) - return SI->getPointerOperand(); - return nullptr; -} - -unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { - if (LoadInst *L = dyn_cast(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast(I)) - return S->getPointerAddressSpace(); - return -1; -} - -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); - unsigned ASA = getAddressSpaceOperand(A); - unsigned ASB = getAddressSpaceOperand(B); - - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; - - // Make sure that A and B are different pointers of the same type. - if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) - return false; - - unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); - Type *Ty = cast(PtrA->getType())->getElementType(); - APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); - - APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); - - APInt OffsetDelta = OffsetB - OffsetA; - - // Check if they are based on the same pointer. That makes the offsets - // sufficient. - if (PtrA == PtrB) - return OffsetDelta == Size; - - // Compute the necessary base pointer delta to have the necessary final delta - // equal to the size. - APInt BaseDelta = Size - OffsetDelta; - - // Otherwise compute the distance with SCEV between the base pointers. - const SCEV *PtrSCEVA = SE->getSCEV(PtrA); - const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *C = SE->getConstant(BaseDelta); - const SCEV *X = SE->getAddExpr(PtrSCEVA, C); - return X == PtrSCEVB; -} - // Reorder commutative operations in alternate shuffle if the resulting vectors // are consecutive loads. This would allow us to vectorize the tree. 
// If we have something like- @@ -1858,10 +1790,10 @@ if (LoadInst *L1 = dyn_cast(Right[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); - if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + if (isConsecutiveAccess(L, L1, SE, DL) && VL1->isCommutative()) { std::swap(Left[j], Right[j]); continue; - } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + } else if (isConsecutiveAccess(L, L1, SE, DL) && VL2->isCommutative()) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -1872,10 +1804,10 @@ if (LoadInst *L1 = dyn_cast(Left[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); - if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + if (isConsecutiveAccess(L, L1, SE, DL) && VL1->isCommutative()) { std::swap(Left[j], Right[j]); continue; - } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + } else if (isConsecutiveAccess(L, L1, SE, DL) && VL2->isCommutative()) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -2005,7 +1937,7 @@ for (unsigned j = 0; j < VL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - if (isConsecutiveAccess(L, L1, DL)) { + if (isConsecutiveAccess(L, L1, SE, DL)) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -2013,7 +1945,7 @@ } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - if (isConsecutiveAccess(L, L1, DL)) { + if (isConsecutiveAccess(L, L1, SE, DL)) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -3248,7 +3180,7 @@ if (i == j) continue; const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); - if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) { + if (isConsecutiveAccess(Stores[i], Stores[j], SE, DL)) { Tails.insert(Stores[j]); Heads.insert(Stores[i]); ConsecutiveChain[Stores[i]] = Stores[j]; Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,5 +1,5 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -vectorize-interleaved-access=true -loop-vectorize 2>&1 | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -vectorize-interleaved-access=true -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; 
CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/AArch64/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/interleaved-access.ll @@ -0,0 +1,497 @@ +; RUN: opt -S -loop-vectorize -vectorize-interleaved-access=true -instcombine -force-vector-width=4 -force-vector-interleave=1 -runtime-memory-check-threshold=24 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -vectorize-interleaved-access=true -instcombine -force-vector-width=4 -force-vector-interleave=2 -runtime-memory-check-threshold=24 < %s | FileCheck %s --check-prefix=UF2 + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; int test_struct_load2(struct ST2 *A) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. 
+; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +; UF2-LABEL: @test_array_load2_store2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. 
+; int A[3072]; +; struct ST S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; T[i].x = X1 + 1; +; T[i].y = X2 + 2; +; T[i].z = X3 + 3; +; } +; } + +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> +; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4 + +; UF2-LABEL: @test_struct_array_load3_store3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <12 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <12 x i32> + +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +; UF2-LABEL: @test_struct_load4( +; UF2: load <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: load <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> +; UF2: shufflevector <16 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +; UF2-LABEL: @test_struct_store4( +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <16 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <4 x i32> +; UF2: shufflevector <8 x i32> +; UF2: store <16 x i32> + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x 
i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; This case checks that reversed interleaved loads can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; int test_reversed_load2(struct ST2 *A) { +; int add = 0; +; int sub = 0; +; for (int i = 1023; i >= 0; i--) { +; add += A[i].x; +; sub -= A[i].y; +; } +; return add*sub; +; } + +; CHECK-LABEL: @test_reversed_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> + +define i32 @test_reversed_load2(%struct.ST2* nocapture readonly %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %mul = mul nsw i32 %sub4, %add1 + ret i32 %mul + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %sub.015 = phi i32 [ 0, %entry ], [ %sub4, %for.body ] + %add.014 = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add1 = add nsw i32 %0, %add.014 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub4 = sub nsw i32 %sub.015, %1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; This case checks that reversed interleaved loads/stores can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: 
shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +}
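
The default BasicTTIImpl::getInterleavedMemoryOpCost added by this patch charges each member a share of the wide memory operation plus one extract/insert pair per lane of the member vector. Below is a minimal standalone sketch of that arithmetic for reference (illustrative only, not part of the patch; it assumes unit cost for each extractelement/insertelement, and the helper name is made up):

// Mirrors the default cost model for one member of an interleaved access:
//   Cost = WideMemOpCost / Stride + SubNumElts * (extract + insert)
// where Stride = NumElts / SubNumElts and each extract/insert is assumed to
// cost 1, as the default TTI implementation would report.
#include <cassert>
#include <cstdio>

static unsigned defaultInterleavedMemberCost(unsigned WideMemOpCost,
                                             unsigned NumElts,
                                             unsigned SubNumElts) {
  assert(SubNumElts && NumElts % SubNumElts == 0 && "invalid vector shapes");
  unsigned Stride = NumElts / SubNumElts;
  assert(Stride > 1 && "not an interleaved access");
  unsigned Cost = WideMemOpCost / Stride; // share of the wide load/store
  Cost += 2 * SubNumElts;                 // one extract + one insert per lane
  return Cost;
}

int main() {
  // E.g. VF = 4 with two interleaved members: wide <8 x i32>, member <4 x i32>,
  // and a wide memory op assumed to cost 2.
  std::printf("cost per member = %u\n", defaultInterleavedMemberCost(2, 8, 4));
  return 0;
}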