Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,11 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of a member of the interleaved Load/Store. + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +587,10 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +749,12 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Stride, Alignment, + AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,20 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace) { + VectorType *VecType = dyn_cast(VecTy); + assert(VecType && "Expect a vector type"); + + // Cost for the sequential memory access for a vector + unsigned Cost = 1; + // Estimate the interleave cost to be extracting all lanes or inserting all + // lanes. + Cost += VecType->getNumElements(); + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -530,6 +530,7 @@ if (Lp != AR->getLoop()) { DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; } // The address calculation must not wrap. Otherwise, a dependence could be @@ -777,21 +778,29 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. 
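For reference, the default TargetTransformInfoImpl implementation above charges one unit for the wide sequential access plus one lane insert/extract per element. A minimal standalone sketch of that formula (plain C++, hypothetical names, not the LLVM API):

#include <cassert>

// Default interleaved-access cost: one wide sequential memory operation plus
// one extract (for loads) or insert (for stores) per vector lane.
static unsigned defaultInterleavedMemoryOpCost(unsigned NumElements) {
  unsigned Cost = 1;    // the wide load/store itself
  Cost += NumElements;  // extracting or inserting every lane
  return Cost;
}

int main() {
  // An <8 x i32> wide vector accessed with stride 2: 1 + 8 = 9.
  assert(defaultInterleavedMemoryOpCost(8) == 9);
  return 0;
}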
- if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + unsigned NumIterations = ForcedFactor * ForcedUnroll; + // The number of vectorized & interleaved iterations should not be less than 2 + if (NumIterations < 2) + NumIterations = 2; + + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when either of: + // (1) Distance < Stride * TypeByteSize + // (2) Distance >= TypeByteSize * NumIterations * Stride + // is true and + // (3) Distance <= MaxSafeDepDistBytes + // is true. + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIterations * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + if (Distance >= TypeByteSize * NumIterations * Stride) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Stride, unsigned Alignment, + unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Stride, Alignment, + AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Target/AArch64/AArch64.h =================================================================== --- lib/Target/AArch64/AArch64.h +++ lib/Target/AArch64/AArch64.h @@ -38,6 +38,7 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); +FunctionPass *createAArch64ShuffleVectorAndMemAccessOptPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); Index: lib/Target/AArch64/AArch64ShuffleVectorAndMemAccessOpt.cpp =================================================================== --- /dev/null +++ lib/Target/AArch64/AArch64ShuffleVectorAndMemAccessOpt.cpp @@ -0,0 +1,487 @@ +//=---------------- AArch64ShuffleVectorAndMemAccessOpt.cpp ----------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64ShuffleVectorAndMemAccessOpt pass which +// matches: +// (1) A load and an interleaved shufflevector into a ldN intrinsic +// (2) A load and several strided shufflevectors into a ldN intrinsic +// (3) An interleaved shufflevector and a store into a stN intrinsic. 
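The new LoopAccessAnalysis rule above accepts a positive dependence distance when it either stays inside one stride-sized gap or is at least NumIterations strides away, and never exceeds the running maximum safe distance. A standalone sketch of that rule with concrete numbers (hypothetical helper, not the LoopAccessAnalysis API):

#include <cassert>
#include <cstdint>

// Safe positive dependence distance for strided accesses: either the distance
// stays inside one stride gap, or it is at least NumIterations strides away,
// and it never exceeds the current maximum safe distance.
static bool isSafePositiveDistance(uint64_t Distance, uint64_t Stride,
                                   uint64_t TypeByteSize,
                                   uint64_t NumIterations,
                                   uint64_t MaxSafeDepDistBytes) {
  bool InsideOneStride = Distance < Stride * TypeByteSize;
  bool FarEnough = Distance >= TypeByteSize * NumIterations * Stride;
  return (InsideOneStride || FarEnough) && Distance <= MaxSafeDepDistBytes;
}

int main() {
  // i32 accesses (4 bytes), stride 3, VF * UF = 4, generous maximum distance.
  // Distance 4 stays inside one 12-byte stride gap: safe.
  assert(isSafePositiveDistance(4, 3, 4, 4, 1ULL << 20));
  // Distance 24 is neither inside a gap nor >= 4 * 3 * 4 = 48 bytes: unsafe.
  assert(!isSafePositiveDistance(24, 3, 4, 4, 1ULL << 20));
  // Distance 48 is far enough for 4 iterations: safe.
  assert(isSafePositiveDistance(48, 3, 4, 4, 1ULL << 20));
  return 0;
}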
+//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/IR/Module.h" +#include "llvm/Analysis/TargetTransformInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-shuffle-access-opt" + +namespace llvm { +static void initializeAArch64ShuffleVectorAndMemAccessOptPass(PassRegistry &); +} + +namespace { +class AArch64ShuffleVectorAndMemAccessOpt : public FunctionPass { + +public: + static char ID; + AArch64ShuffleVectorAndMemAccessOpt() : FunctionPass(ID) { + initializeAArch64ShuffleVectorAndMemAccessOptPass( + *PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "AArch64 ShuffleVector and Memory Access Optimization Pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + + /// Iterate over the functions and promote the computation of interesting + // sext instructions. + bool runOnFunction(Function &F) override; + +private: + TargetTransformInfo *TTI; + const DataLayout *DL; + Module *M; + + // Check if the mask of the given shufflevecotor is like: + // <0, Stride, Stride*2, ..., Stride*(NumElts-1), 1, Stride+1, ...> + // E.g. For 2 interleaved vectors of <4 x i32> type, the mask is: + // <0, 2, 4, 6, 1, 3, 5, 7> + bool isInterleavedLoadMask(ShuffleVectorInst *SVI, unsigned &Stride, + unsigned &NumElts, VectorType *&VecTy); + // Match a load and a shufflevector with interleaved mask into a ldN + // intrinsic. E.G. Translate following IRs: + // %wide.vec = load <8 x i32>, <8 x i32>* %ptr + // %interleaved.vec = shufflevector <8 x i32> %wide.vec, undef, + // <0, 2, 4, 8, 1, 3, 5, 7> + // into: + // %ld2 = call {<4 x i32>, <4 x i32>} aarch64.neon.ld2(%ptr) + // %v0 = extractvalue %ld2, 0 + // %v1 = extractvalue %ld2, 1 + // %interleaved.vec = shufflevector %v0, %v1, <0, 1, 2, 3, 4, 5, 6, 7> + bool matchInterleavedLoad(ShuffleVectorInst *SVI, unsigned Stride, + unsigned NumElts, VectorType *VecTy); + + // Check if the mask of the given shufflevecotor is like: + // + // E.g. For 2 interleaved vectors of <4 x i32> type, the strided masks are: + // Index 0: <0, 2, 4, 6> + // Index 1: <1, 3, 5, 7> + bool isStridedLoadMask(ShuffleVectorInst *SVI, unsigned &Stride, + unsigned &NumElts, VectorType *&VecTy); + // Match a load and several shufflevectors with strided mask into a ldN + // intrinsic. E.G. Translate following IRs: + // %wide.vec = load <8 x i32>, <8 x i32>* %ptr + // %v0 = shufflevector %wide.vec, undef, <0, 2, 4, 8> + // %v1 = shufflevector %wide.vec, undef, <1, 3, 5, 7> + // into: + // %ld2 = call {<4 x i32>, <4 x i32>} @aarch64.neon.ld2(%ptr) + // %v0 = extractvalue %ld2, 0 + // %v1 = extractvalue %ld2, 1 + bool matchStridedLoad(ShuffleVectorInst *SVI, unsigned Stride, + unsigned NumElts, VectorType *VecTy); + + // Check if the mask of the given shufflevecotor is like: + // <0, NumElts, NumElts*2, ..., NumElts*(Stride-1), 1, NumElts+1, ...> + // E.g. For 2 interleaved vectors of <4 x i32> type, the mask is: + // <0, 4, 1, 5, 2, 6, 3, 7> + bool isInterleavedStoreMask(ShuffleVectorInst *SVI, unsigned &Stride, + unsigned &NumElts, VectorType *&VecTy); + // Match a shufflevector with interleaved mask and a store into a stN + // intrinsic. E.G. 
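The three mask shapes this pass recognizes can be generated directly from Stride and NumElts. A standalone sketch (plain C++, illustrative only):

#include <cstdio>
#include <vector>

// Mask for an interleaved load: <0, S, 2S, ..., (N-1)S, 1, S+1, ...>.
static std::vector<unsigned> interleavedLoadMask(unsigned Stride,
                                                 unsigned NumElts) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < Stride; i++)
    for (unsigned j = 0; j < NumElts; j++)
      Mask.push_back(j * Stride + i);
  return Mask;
}

// Strided mask with start index Idx: <Idx, Idx+S, ..., Idx+(N-1)S>.
static std::vector<unsigned> stridedMask(unsigned Idx, unsigned Stride,
                                         unsigned NumElts) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < NumElts; i++)
    Mask.push_back(Idx + i * Stride);
  return Mask;
}

// Mask for an interleaved store: <0, N, 2N, ..., (S-1)N, 1, N+1, ...>.
static std::vector<unsigned> interleavedStoreMask(unsigned Stride,
                                                  unsigned NumElts) {
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i < NumElts; i++)
    for (unsigned j = 0; j < Stride; j++)
      Mask.push_back(j * NumElts + i);
  return Mask;
}

int main() {
  // Stride 2, 4 elements per member vector:
  //   interleaved load mask : 0 2 4 6 1 3 5 7
  //   strided mask (index 1): 1 3 5 7
  //   interleaved store mask: 0 4 1 5 2 6 3 7
  for (unsigned m : interleavedLoadMask(2, 4)) printf("%u ", m);
  printf("\n");
  for (unsigned m : stridedMask(1, 2, 4)) printf("%u ", m);
  printf("\n");
  for (unsigned m : interleavedStoreMask(2, 4)) printf("%u ", m);
  printf("\n");
  return 0;
}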
Translate following IRs: + // %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, + // <0, 4, 1, 5, 2, 6, 3, 7> + // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr + // into: + // call void @aarch64.neon.ld2(<4 x i32> %v0, <4 x i32> %v1, %ptr) + // + // Or translate following IRs: + // %interleaved.vec = shufflevector <8 x i32> %wide.vec, undef, + // <0, 4, 1, 5, 2, 6, 3, 7> + // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr + // into: + // %v0 = shufflevector <8 x i32> %wide,vec, undef, <0, 1, 2, 3> + // %v1 = shufflevector <8 x i32> %wide,vec, undef, <4, 5, 6, 7> + // call void @aarch64.neon.ld2(<4 x i32> %v0, <4 x i32> %v1, %ptr) + bool matchInterleavedStore(ShuffleVectorInst *SVI, unsigned Stride, + unsigned NumElts, VectorType *VecTy); +}; +} // end anonymous namespace. + +char AArch64ShuffleVectorAndMemAccessOpt::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64ShuffleVectorAndMemAccessOpt, DEBUG_TYPE, + "AArch64 ShuffleVector and Memory Access Opt Pass", false, + false) +INITIALIZE_PASS_END(AArch64ShuffleVectorAndMemAccessOpt, DEBUG_TYPE, + "AArch64 ShuffleVector and Memory Access Opt Pass", false, + false) + +FunctionPass *llvm::createAArch64ShuffleVectorAndMemAccessOptPass() { + return new AArch64ShuffleVectorAndMemAccessOpt(); +} + +// Get a ldN/stN intrinsic addroding to the Stride (which must be 2, 3 or 4). +static unsigned getLdNStNIntrinsicID(unsigned Stride, bool IsLoad) { + static unsigned LoadInt[3] = {Intrinsic::aarch64_neon_ld2, + Intrinsic::aarch64_neon_ld3, + Intrinsic::aarch64_neon_ld4}; + static unsigned StoreInt[3] = {Intrinsic::aarch64_neon_st2, + Intrinsic::aarch64_neon_st3, + Intrinsic::aarch64_neon_st4}; + + if (IsLoad) + return LoadInt[Stride - 2]; + else + return StoreInt[Stride - 2]; +} + +// Concatenate two vectors with the same type. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2, + unsigned NumElts) { + SmallVector Mask; + for (unsigned i = 0; i < NumElts; i++) + Mask.push_back(Builder.getInt32(i)); + + return Builder.CreateShuffleVector(V1, V2, ConstantVector::get(Mask)); +} + +// Concatenate vectors with the same type. +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &List) { + unsigned Size = List.size(); + assert(Size > 1 && Size < 5 && "Invalid number of vectors"); + + Value *V0 = List[0]; + Value *V1 = List[1]; + VectorType *VecTy = dyn_cast(V0->getType()); + unsigned NumElts = VecTy->getNumElements(); + if (Size == 2) + return ConcateTwoVectors(Builder, V0, V1, NumElts * 2); + + Value *V2 = List[2]; + Value *V3 = (Size == 3) ? UndefValue::get(VecTy) : List[3]; + unsigned TotalNumElts = (Size == 3) ? NumElts * 3 : NumElts * 4; + + Value *V0V1 = ConcateTwoVectors(Builder, V0, V1, NumElts * 2); + Value *V2V3 = ConcateTwoVectors(Builder, V2, V3, NumElts * 2); + return ConcateTwoVectors(Builder, V0V1, V2V3, TotalNumElts); +} + +// TODO: Could also consider about undef mask (i.e. 
the negative mask) +bool AArch64ShuffleVectorAndMemAccessOpt::isInterleavedLoadMask( + ShuffleVectorInst *SVI, unsigned &Stride, unsigned &NumElts, + VectorType *&VecTy) { + LoadInst *LI = dyn_cast(SVI->getOperand(0)); + if (!LI || !LI->isSimple() || LI->getType() != SVI->getType()) + return false; + + SmallVector Mask(SVI->getShuffleMask()); + unsigned NumTotalElts = Mask.size(); + if (NumTotalElts < 4 || Mask[0] != 0) + return false; + + Stride = static_cast(Mask[1]); + if (Stride < 2 || Stride > 4) + return false; + + if (NumTotalElts < Stride || NumTotalElts % Stride != 0) + return false; + + NumElts = NumTotalElts / Stride; + // Mask should be: 0, Stride, ..., (NumElts-1)*Stride, 1, Stride+1, ... + for (unsigned i = 0; i < Stride; i++) + for (unsigned j = 0; j < NumElts; j++) + if (static_cast(Mask[j + i * NumElts]) != j * Stride + i) + return false; + + VecTy = VectorType::get(SVI->getType()->getElementType(), NumElts); + return TTI->isTypeLegal(VecTy); +} + +// TODO: Could also consider about undef mask (i.e. the negative mask) +bool AArch64ShuffleVectorAndMemAccessOpt::isInterleavedStoreMask( + ShuffleVectorInst *SVI, unsigned &Stride, unsigned &NumElts, + VectorType *&VecTy) { + if (!SVI->hasOneUse()) + return false; + StoreInst *SI = dyn_cast(SVI->user_back()); + if (!SI || !SI->isSimple()) + return false; + + SmallVector Mask(SVI->getShuffleMask()); + unsigned NumTotalElts = Mask.size(); + if (NumTotalElts < 4 || Mask[0] != 0) + return false; + + NumElts = static_cast(Mask[1]); + if (NumElts < 2 || NumElts > 16) + return false; + + if (NumTotalElts < NumElts || NumTotalElts % NumElts != 0) + return false; + + Stride = NumTotalElts / NumElts; + // Mask should be: 0, NumElts, ..., (Stride-1)*NumElts, 1, NumElts+1, ... + for (unsigned i = 0; i < NumElts; i++) + for (unsigned j = 0; j < Stride; j++) + if (static_cast(Mask[j + i * Stride]) != j * NumElts + i) + return false; + + VecTy = VectorType::get(SVI->getType()->getElementType(), NumElts); + return TTI->isTypeLegal(VecTy); +} + +// TODO: Could also consider about undef mask (i.e. the negative mask) +bool AArch64ShuffleVectorAndMemAccessOpt::isStridedLoadMask( + ShuffleVectorInst *SVI, unsigned &Stride, unsigned &NumElts, + VectorType *&VecTy) { + LoadInst *LI = dyn_cast(SVI->getOperand(0)); + if (!LI || !LI->isSimple()) + return false; + + SmallVector Mask(SVI->getShuffleMask()); + NumElts = Mask.size(); + if (NumElts < 2) + return false; + + unsigned Index = static_cast(Mask[0]); + Stride = static_cast(Mask[1]) - Index; + if (Stride < 2 || Stride > 4 || Index >= Stride) + return false; + + VectorType *WideVecTy = dyn_cast(SVI->getOperand(0)->getType()); + unsigned TotalElts = WideVecTy->getVectorNumElements(); + if (TotalElts % Stride != 0 || TotalElts / Stride != NumElts) + return false; + + // The index should match: Idx, Idx+Stride, ..., Idx+(NumElts-1)*Stride + for (unsigned i = 1; i < NumElts; i++) + if (static_cast(Mask[i]) != Index + Stride * i) + return false; + + // Also need to check whether the vector type is legal. + VecTy = VectorType::get(WideVecTy->getElementType(), NumElts); + return TTI->isTypeLegal(VecTy); +} + +// Check if the given mask is strided mask and it is in the same group as the +// given Stride and NumElts. +// E.g. 
If Stride is 2 and NumElts is 4, the masks in the same group are: +// Index 0: <0, 2, 4, 6> +// Index 1: <1, 3, 5, 7> +static bool isStridedMaskInGroup(ArrayRef Mask, unsigned &Index, + unsigned Stride, unsigned NumElts) { + if (Mask.size() != NumElts) + return false; + + Index = static_cast(Mask[0]); + if (Index >= Stride) + return false; + + for (unsigned i = 1; i < NumElts; i++) + if (static_cast(Mask[i]) != Index + Stride * i) + return false; + return true; +} + +bool AArch64ShuffleVectorAndMemAccessOpt::matchStridedLoad( + ShuffleVectorInst *SVI, unsigned Stride, unsigned NumElts, + VectorType *VecTy) { + LoadInst *LI = dyn_cast(SVI->getOperand(0)); + + // Record the index in the interleaved load for each strided load. + SmallVector, 4> StridedList; + for (Value::user_iterator UI = LI->user_begin(), E = LI->user_end(); + UI != E;) { + ShuffleVectorInst *SV = dyn_cast(*UI++); + if (!SV) + continue; + unsigned Order; + if (isStridedMaskInGroup(SV->getShuffleMask(), Order, Stride, NumElts)) + StridedList.push_back(std::make_pair(SV, Order)); + } + + // Watch out for the pointer vector type, which can not be an argument of + // the intrinsics. Need to get integer vector first and then convert it to + // pointer vector. + Type *EltTy = VecTy->getVectorElementType(); + bool IsEltPtr = EltTy->isPointerTy(); + if (IsEltPtr) + VecTy = VectorType::get(DL->getIntPtrType(EltTy), NumElts); + + Type *NewPtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[2] = {VecTy, NewPtrTy}; + Intrinsic::ID IntID = (Intrinsic::ID)getLdNStNIntrinsicID(Stride, true); + Value *Callee = Intrinsic::getDeclaration(M, IntID, Tys); + + IRBuilder<> Builder(LI); + Value *NewPtr = Builder.CreateBitCast(LI->getPointerOperand(), NewPtrTy); + Value *CallI = Builder.CreateCall(Callee, NewPtr, "ldN"); + unsigned Size = StridedList.size(); + // Replace each strided shuffle with the result vector of ldN. + + DEBUG(dbgs() << "Replace:\n " << *LI << "\n"); + for (unsigned i = 0; i < Size; i++) { + ShuffleVectorInst *SV = StridedList[i].first; + unsigned Index = StridedList[i].second; + Value *V = Builder.CreateExtractValue(CallI, Index); + + // Convert the integer vector to pointer vector. + if (IsEltPtr) + V = Builder.CreateIntToPtr(V, SV->getType()); + SV->replaceAllUsesWith(V); + + DEBUG(dbgs() << " " << *SV << "\n"); + // Avoid analyzing this shufflevector twice. + if (SV != SVI) + SV->eraseFromParent(); + } + DEBUG(dbgs() << "By:\n " << *CallI << "\n"); + + return true; +} + +bool AArch64ShuffleVectorAndMemAccessOpt::matchInterleavedLoad( + ShuffleVectorInst *SVI, unsigned Stride, unsigned NumElts, + VectorType *VecTy) { + LoadInst *LI = dyn_cast(SVI->getOperand(0)); + + // Watch out for the pointer vector type, which can not be an argument of + // the intrinsics. Need to get integer vector first and then convert it to + // pointer vector. + Type *EltTy = VecTy->getVectorElementType(); + bool IsEltPtr = EltTy->isPointerTy(); + if (IsEltPtr) + VecTy = VectorType::get(DL->getIntPtrType(EltTy), NumElts); + + Type *NewPtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[2] = {VecTy, NewPtrTy}; + Intrinsic::ID IntID = (Intrinsic::ID)getLdNStNIntrinsicID(Stride, true); + Value *Callee = Intrinsic::getDeclaration(M, IntID, Tys); + IRBuilder<> Builder(LI); + Value *NewPtr = Builder.CreateBitCast(LI->getPointerOperand(), NewPtrTy); + + // Create a ldN call to do the interleaved load. 
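Folding several strided shuffles of the same wide load into one ldN relies on the group-membership check above: recover the start index of the mask and verify every later element. A standalone restatement for reference (std::vector in place of ArrayRef<int>):

#include <cassert>
#include <vector>

// Does Mask select every Stride-th element starting at some index < Stride?
// On success, Index is the position of this member inside the ldN result.
static bool stridedMaskInGroup(const std::vector<unsigned> &Mask,
                               unsigned Stride, unsigned NumElts,
                               unsigned &Index) {
  if (Mask.size() != NumElts)
    return false;
  Index = Mask[0];
  if (Index >= Stride)
    return false;
  for (unsigned i = 1; i < NumElts; i++)
    if (Mask[i] != Index + Stride * i)
      return false;
  return true;
}

int main() {
  unsigned Index;
  // <1, 3, 5, 7> is member 1 of a stride-2 group of 4-element vectors.
  assert(stridedMaskInGroup({1, 3, 5, 7}, 2, 4, Index) && Index == 1);
  // <0, 3, 6, 9> does not belong to a stride-2 group.
  assert(!stridedMaskInGroup({0, 3, 6, 9}, 2, 4, Index));
  return 0;
}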
+ CallInst *CallI = Builder.CreateCall(Callee, NewPtr, "ldN"); + DEBUG(dbgs() << "Replace:\n " << *LI << "\n " << *SVI << "\n"); + DEBUG(dbgs() << "By:\n " << *CallI << "\n"); + + SmallVector Vectors; + for (unsigned i = 0; i < Stride; i++) + Vectors.push_back(Builder.CreateExtractValue(CallI, i)); + + // Concatenate apart vectors from the new ldN intrinsic call to a wide + // vector. + Value *WideVec = ConcateVectors(Builder, Vectors); + + // Convert the integer vector to pointer vector. + if (IsEltPtr) + WideVec = Builder.CreateIntToPtr(WideVec, SVI->getType()); + + SVI->replaceAllUsesWith(WideVec); + return true; +} + +// Get a mask consists of two parts: sequential constants, several UNDEFs. +// I.E. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value. + if (NumUndef == 0) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +bool AArch64ShuffleVectorAndMemAccessOpt::matchInterleavedStore( + ShuffleVectorInst *SVI, unsigned Stride, unsigned NumElts, + VectorType *VecTy) { + Value *V0 = SVI->getOperand(0); + VectorType *SVOpTy = dyn_cast(V0->getType()); + + // Watch out for the pointer vector type, which can not be an argument of + // the intrinsics. Need to convert pointer vector to integer vector. + Type *EltTy = VecTy->getVectorElementType(); + bool IsEltPtr = EltTy->isPointerTy(); + if (IsEltPtr) + VecTy = VectorType::get(DL->getIntPtrType(EltTy), NumElts); + + StoreInst *SI = dyn_cast(SVI->user_back()); + Value *V1 = SVI->getOperand(1); + IRBuilder<> Builder(SI); + + // Convert pointer vector to integer vector. + if (IsEltPtr) { + Type *IntVecTy = VectorType::get(DL->getIntPtrType(EltTy), + SVOpTy->getVectorNumElements()); + V0 = Builder.CreatePtrToInt(V0, IntVecTy); + V1 = Builder.CreatePtrToInt(V1, IntVecTy); + } + + SmallVector Ops; + for (unsigned i = 0; i < Stride; i++) { + Constant *Mask = getSequentialMask(Builder, NumElts * i, NumElts, 0); + Ops.push_back(Builder.CreateShuffleVector(V0, V1, Mask)); + } + + Type *NewPtrTy = VecTy->getPointerTo(SI->getPointerAddressSpace()); + Value *NewPtr = Builder.CreateBitCast(SI->getPointerOperand(), NewPtrTy); + Ops.push_back(NewPtr); + Type *Tys[2] = {VecTy, NewPtrTy}; + Intrinsic::ID IntID = (Intrinsic::ID)getLdNStNIntrinsicID(Stride, false); + Value *Callee = Intrinsic::getDeclaration(M, IntID, Tys); + + // Create a stN call do the interleaved store. 
+ CallInst *CallI = Builder.CreateCall(Callee, Ops); + + DEBUG(dbgs() << "Replace:\n " << *SI << "\n " << *SVI << "\n"); + DEBUG(dbgs() << "By:\n " << *CallI << "\n"); + SI->eraseFromParent(); + return true; +} + +bool AArch64ShuffleVectorAndMemAccessOpt::runOnFunction(Function &F) { + DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n"); + TTI = &getAnalysis().getTTI(F); + M = F.getParent(); + DL = &M->getDataLayout(); + + bool Changed = false; + for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; I++) { + ShuffleVectorInst *SVI = dyn_cast(I); + if (!SVI) + continue; + + unsigned Stride, NumElts; + VectorType *VecTy; + if (isInterleavedLoadMask(SVI, Stride, NumElts, VecTy)) + Changed |= matchInterleavedLoad(SVI, Stride, NumElts, VecTy); + else if (isStridedLoadMask(SVI, Stride, NumElts, VecTy)) + Changed |= matchStridedLoad(SVI, Stride, NumElts, VecTy); + + if (isInterleavedStoreMask(SVI, Stride, NumElts, VecTy)) + Changed |= matchInterleavedStore(SVI, Stride, NumElts, VecTy); + } + } + return Changed; +} Index: lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetMachine.cpp +++ lib/Target/AArch64/AArch64TargetMachine.cpp @@ -67,6 +67,11 @@ " to make use of cmpxchg flow-based information"), cl::init(true)); +static cl::opt AArch64ShuffleAccessOpt( + "aarch64-shuffle-access-opt", + cl::desc("Optimize shufflevector and memory access in AArch64 backend"), + cl::init(true), cl::Hidden); + static cl::opt EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), @@ -225,6 +230,9 @@ if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass()); + if (TM->getOptLevel() != CodeGenOpt::None && AArch64ShuffleAccessOpt) + addPass(createAArch64ShuffleVectorAndMemAccessOptPass()); + TargetPassConfig::addIRPasses(); if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -139,6 +139,9 @@ bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Stride, unsigned Alignment, + unsigned AddressSpace); /// @} }; Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -407,6 +407,33 @@ return LT.first; } +unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, + Type *VecTy, + unsigned Stride, + unsigned Alignment, + unsigned AddressSpace) { + VectorType *VecType = dyn_cast(VecTy); + assert(VecType && "Expect a vector type"); + + unsigned Cost = getMemoryOpCost(Opcode, VecType, Alignment, AddressSpace); + if (isTypeLegal(VecType) && Stride > 1 && Stride < 5) + return Cost; + + unsigned NumElts = VecType->getNumElements(); + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Unexpected opcode for interleaved memory access"); + + // Total cost for the memory access of a wide vector + bool IsLoad = (Opcode == Instruction::Load); + // Roughly estimate the shuffle cost to be extracting each lanes 
or inserting + // each lanes. + for (unsigned i = 0; i < NumElts; i++) + Cost += getVectorInstrCost(IsLoad ? Instruction::ExtractElement + : Instruction::InsertElement, + VecType, i); + return Cost; +} + unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { unsigned Cost = 0; for (auto *I : Tys) { Index: lib/Target/AArch64/CMakeLists.txt =================================================================== --- lib/Target/AArch64/CMakeLists.txt +++ lib/Target/AArch64/CMakeLists.txt @@ -38,6 +38,7 @@ AArch64PBQPRegAlloc.cpp AArch64RegisterInfo.cpp AArch64SelectionDAGInfo.cpp + AArch64ShuffleVectorAndMemAccessOpt.cpp AArch64StorePairSuppress.cpp AArch64Subtarget.cpp AArch64TargetMachine.cpp Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -134,6 +134,39 @@ "enable-mem-access-versioning", cl::init(true), cl::Hidden, cl::desc("Enable symblic stride memory access versioning")); +/// This enables vectorization on the interleaved memory accesses in code like +/// the following. +/// for (i = 0; i < N; i+=3) { +/// R = Pic[i]; // Load R color elements +/// G = Pic[i+1]; // Load G color elements +/// B = Pic[i+2]; // Load B color elements +/// ... // do something to R, G, B +/// } +/// +/// Will be translated to a wide vector load and several shufflevectors +/// %wide.vec = load <12 x i32>, <12 x i32>* %ptr ; load for R,G,B +/// %R.vec = shufflevector %wide.vec, undef, <0, 3, 6, 9> ; mask for R load +/// %G.vec = shufflevector %wide.vec, undef, <1, 4, 7, 10> ; mask for G load +/// %B.vec = shufflevector %wide.vec, undef, <2, 5, 8, 11> ; mask for B load +/// +/// for (i = 0; i < N; i+=3) { +/// ... do something to R, G, B +/// Pic[i] = R; // Store R color elements +/// Pic[i+1] = G; // Store G color elements +/// Pic[i+2] = B; // Store B color elements +/// } +/// +/// Will be translated to several shufflevectors and a wide vector store +/// %RG.vec = shufflevector %R.vec, %G.vec, <0, 1, 2, ..., 7> +/// %BU.vec = shufflevector %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +/// %interleave = shufflevector %RG.vec, %BU.vec, +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; mask for interleaved store +/// store <12 x i32> %interleave, <12 x i32>* %ptr ; write for R,G,B +static cl::opt + VectorizeInterleavedAccess("vectorize-interleaved-access", cl::init(true), + cl::Hidden, + cl::desc("Vectorize interleaved memory access")); + /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; @@ -351,6 +384,8 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + void vectorizeInterleavedAccess(Instruction *InsPos, Value *Ptr, Type *VecTy); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -568,6 +603,15 @@ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} + ~LoopVectorizationLegality() { + SmallSet DelSet; + // Collecting all the pointers to avoid delete a pointer twice, + for (auto &I : InterleavedAccessMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + /// This enum represents the kinds of inductions that we support. enum InductionKind { IK_NoInduction, ///< Not an induction variable. 
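On AArch64 the hook above charges only the memory operation when the member type is legal and the stride is 2, 3 or 4 (a single ldN/stN), and otherwise falls back to the wide access plus one lane operation per element. A minimal sketch of that decision (the cost parameters are stand-ins for getMemoryOpCost and getVectorInstrCost, not real API calls):

#include <cassert>

// AArch64-style interleaved access cost: a legal vector type with stride 2-4
// maps onto a single ldN/stN, so only the memory operation is charged; any
// other case pays the wide access plus one lane insert/extract per element.
static unsigned aarch64InterleavedCost(bool TypeIsLegal, unsigned Stride,
                                       unsigned NumElts, unsigned MemOpCost,
                                       unsigned LaneOpCost) {
  if (TypeIsLegal && Stride >= 2 && Stride <= 4)
    return MemOpCost;                       // one ldN/stN
  return MemOpCost + NumElts * LaneOpCost;  // wide access + per-lane shuffles
}

int main() {
  // Legal <4 x i32> with stride 3: just the ld3/st3 cost.
  assert(aarch64InterleavedCost(true, 3, 4, 1, 1) == 1);
  // Illegal type or out-of-range stride: 1 + 4 lane operations.
  assert(aarch64InterleavedCost(false, 3, 4, 1, 1) == 5);
  return 0;
}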
@@ -640,6 +684,54 @@ ConstantInt *StepValue; }; + // An interleaved access list contains a group of interleaved accesses. + // E.g. for (i = 0; i < N; i+=2) { + // even = A[i]; + // odd = A[i + 1]; + // ... // operations on even/odd + // } + // The List consists of two accesses: even load, odd load. + struct InterleavedAccess { + InterleavedAccess(int Stride, bool IsLoad) + : Stride(Stride), IsLoad(IsLoad) {} + InterleavedAccess() : Stride(0) {} + + unsigned getIndex(Instruction *Access) { + assert(Members.count(Access) && + "The interleaved access doesn't contain such member"); + + for (unsigned i = 0; i < Members.size(); i++) + if (Members[i] == Access) + return i; + return Members.size(); + } + + unsigned getNumMembers() { return Members.size(); } + + bool isInsertPos(Instruction *Access) { return Access == InsertPos; } + + int Stride; + bool IsLoad; + unsigned Align; + // The interleaved access list. + SmallSetVector Members; + + // To avoid breaking the dependence, the new vectorized interleaved access + // should be inserted at the position of either: + // (1) The first load access. + // or + // (2) The last store access. + // + // E.g. %even = load i32 // Insert Position (First load) + // %add = add i32 %even // Use of %even + // %odd = load i32 // Should not insert here + // + // store i32 %even // Should not insert here + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position (Last store) + Instruction *InsertPos; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. typedef DenseMap ReductionList; @@ -730,6 +822,16 @@ unsigned getNumPredStores() const { return NumPredStores; } + + bool belongsToInterleavedAccess(Instruction *I) { + return InterleavedAccessMap.count(I); + } + InterleavedAccess *getInterleavedAccess(Instruction *I) { + assert(belongsToInterleavedAccess(I) && + "The given access must belong to an interleaved access"); + return InterleavedAccessMap[I]; + } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -762,7 +864,21 @@ /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop /// invariant. - void collectStridedAccess(Value *LoadOrStoreInst); + void collectInvarStridedAccess(Value *LoadOrStoreInst); + + /// Analyze interleaved accesses. + void analyzeInterleavedAccess(); + + /// Collect constant strided memory accesses. + /// + /// Looks for accesses like "a[i * C]" where "C" is a constant (Except +/-1). + void + collectConstStridedAccess(MapVector &StridedAccesses); + + /// Find all interleaved accesses in a queue of loads/stores with the same + /// stride and base pointer. + void collectInterleavedAccess(SmallVector &Queue, + int Stride, bool IsLoad); /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. These are handled as LoopAccessReport rather than @@ -822,6 +938,10 @@ /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; + + /// A map contains the relationships between the members of an interleaved + /// access and the interleaved access. + DenseMap InterleavedAccessMap; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1657,6 +1777,196 @@ "reverse"); } +// Get the mask to interleaved a wide vector concatenated from several small +// vectors like: V0, V1, V2, ... 
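The InsertPos rule above (first load, last store) amounts to a small scan over the block. A standalone sketch, assuming group membership is already known:

#include <cassert>
#include <vector>

// Pick where the widened access may be emitted without breaking dependences:
// the first member for an interleaved load group, the last member for an
// interleaved store group.
static int pickInsertPos(const std::vector<bool> &IsGroupMember,
                         bool IsLoadGroup) {
  int Pos = -1;
  int e = (int)IsGroupMember.size();
  for (int i = 0; i != e; ++i) {
    if (!IsGroupMember[i])
      continue;
    if (IsLoadGroup)
      return i;  // first member: insert at the first load
    Pos = i;     // keep updating: the last store wins
  }
  return Pos;
}

int main() {
  // load A[i], add, load A[i+1]   -> widen at index 0 (first load)
  assert(pickInsertPos({true, false, true}, /*IsLoadGroup=*/true) == 0);
  // store A[i], def, store A[i+1] -> widen at index 2 (last store)
  assert(pickInsertPos({true, false, true}, /*IsLoadGroup=*/false) == 2);
  return 0;
}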
+// The mask is: +// +// which is equal to <0, VF, VF*2, ..., VF*NumVec-VF, 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors of <4 x i32> type, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedStoreMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting: +// +// This can be used to interleave a wide vector into several small vectors. +// E.g. To interleaved a wide vector of <8 x i32> type: +// %v0 = shufflevector <8 x i32> %wide.vec, undef, <0, 2, 4, 6> +// %v1 = shufflevector <8 x i32> %wide.vec, undef, <1, 3, 5, 7> +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consist of sequential constants, +// The second part consist of UNDEFs. +// I.E. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value. + if (NumUndef == 0) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 1st vector always +// has the same elements as the 2nd vector or more elements. +// If the 1st vector has more elements, extend the 2nd vector with UNDEFs. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 to with undef elements. + Constant *ExtMask = + getSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *Undef = UndefValue::get(VecTy2); + Value *ExtV = Builder.CreateShuffleVector(V2, Undef, ExtMask); + + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV, Mask); +} + +// Concatenate number of vectors in the give list starting from Idx +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &List, unsigned Idx, + unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return List[Idx]; + + unsigned PartNum = PowerOf2Floor(NumVec); + // When the number of vectors is not power of 2, split the list and make sure + // the first list has power of 2 elements. + // E.g. To concate 7 vectors, split into two lists with 4 and 3 elements. 
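The comment above describes splitting 7 vectors into 4 + 3; a plain-C++ sketch of that power-of-two splitting, with ordinary concatenation standing in for the shuffle pair:

#include <cassert>
#include <vector>

// Split a list of NumVec small vectors so that the first part has a
// power-of-two count (half when NumVec is already a power of two),
// concatenate each part recursively, then join the two results.
static std::vector<int> concatVectors(const std::vector<std::vector<int>> &List,
                                      unsigned Idx, unsigned NumVec) {
  assert(NumVec != 0 && "Number of vectors should not be zero");
  if (NumVec == 1)
    return List[Idx];

  unsigned PartNum = 1;
  while (PartNum * 2 <= NumVec)  // PowerOf2Floor(NumVec)
    PartNum *= 2;
  if (PartNum == NumVec)
    PartNum /= 2;

  std::vector<int> V0 = concatVectors(List, Idx, PartNum);
  std::vector<int> V1 = concatVectors(List, Idx + PartNum, NumVec - PartNum);
  V0.insert(V0.end(), V1.begin(), V1.end());  // stands in for the shuffle pair
  return V0;
}

int main() {
  // Three 2-element members {1,2}, {3,4}, {5,6} concatenate to 1..6,
  // split as 2 + 1, just like the 7 -> 4 + 3 example above.
  std::vector<std::vector<int>> List = {{1, 2}, {3, 4}, {5, 6}};
  std::vector<int> Wide = concatVectors(List, 0, 3);
  assert((Wide == std::vector<int>{1, 2, 3, 4, 5, 6}));
  return 0;
}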
+ if (PartNum == NumVec) + PartNum /= 2; + Value *V0 = ConcateVectors(Builder, List, Idx, PartNum); + unsigned NextIdx = Idx + PartNum; + Value *V1 = ConcateVectors(Builder, List, NextIdx, NumVec - PartNum); + return ConcateTwoVectors(Builder, V0, V1); +} + +// Create the vectorized instruction for interleaved access. +// E.g. Transform interleaved load with 2 members: +// %s1 = load i32, i32* %ptr1 +// %s2 = load i32, i32* %ptr2 +// To +// %wide.vec = load <8 x i32>, <8 x i32>* %ptr +// %v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6> +// %v2 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7> +// +// Transform interleaved store with 3 members: +// store i32 %s1, i32 %ptr1 +// store i32 %s2, i32 %ptr2 +// store i32 %s3, i32 %ptr3 +// To +// %v1_v2 = shufflevector <4 x i32> %v1, %v2, <0, 1, 2, 3, 4, 5, 6, 7> +// %v3_und = shufflevector <4 x i32> %v3, undef, <0, 1, 2, 3, 4, 5, 6, 7> +// %interleaved.vec = shufflevector %v1_v2, %v3_und, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +// store <12 x i32> %interleaved.vec, <12 x i32> %ptr +void InnerLoopVectorizer::vectorizeInterleavedAccess(Instruction *Instr, + Value *Ptr, Type *DataTy) { + VectorType *VecTy = dyn_cast(DataTy); + assert(VecTy && "expect a vector type"); + + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + assert(IA->InsertPos == Instr && "Must be the insert position"); + + unsigned Idx = IA->getIndex(Instr); + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + SmallSetVector &Members = IA->Members; + unsigned Align = IA->Align; + unsigned NumVec = Members.size(); + + Type *EleTy = VecTy->getElementType(); + Type *WideVecTy = VectorType::get(EleTy, NumVec * VF); + Type *WidePtrTy = WideVecTy->getPointerTo(AddressSpace); + + Constant *Zero = Builder.getInt32(0); + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + for (unsigned Part = 0; Part < UF; Part++) { + // Adjust pointer to the access of index 0. + Constant *Offset = Builder.getInt32(-static_cast(Idx)); + Value *NewPtr = Builder.CreateExtractElement(PtrParts[Part], Zero); + NewPtr = Builder.CreateGEP(NewPtr, Offset); + + // Bit cast to the new wide vector pointer type. + NewPtr = Builder.CreateBitCast(NewPtr, WidePtrTy); + NewPtrs.push_back(NewPtr); + } + + setDebugLocFromInst(Builder, Instr); + if (IA->IsLoad) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *WideLoad = + Builder.CreateAlignedLoad(NewPtrs[Part], Align, false, "wide.vec"); + + // Interleave and split the loaded wide vector into small vectors. + Value *Undef = UndefValue::get(WideVecTy); + for (unsigned i = 0; i < NumVec; i++) { + VectorParts &Entry = WidenMap.get(Members[i]); + Constant *StridedMask = getStridedMask(Builder, i, NumVec, VF); + Entry[Part] = Builder.CreateShuffleVector(WideLoad, Undef, StridedMask, + "strided.vec"); + } + + propagateMetadata(WideLoad, Instr); + } + return; + } + + for (unsigned Part = 0; Part < UF; Part++) { + SmallVector StoredVal; + for (unsigned i = 0; i < NumVec; i++) + StoredVal.push_back(getVectorValue(Members[i]->getOperand(0))[Part]); + // Concatenate the apart vectors into a wide vector. + Value *WideVec = ConcateVectors(Builder, StoredVal, 0, NumVec); + + Constant *IMask = getInterleavedStoreMask(Builder, VF, NumVec); + // Interleave the data in the wide vector. 
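The store path concatenates the members and applies the interleaved mask, and the strided masks used on the load path undo exactly that layout. A standalone round-trip check (plain C++ arrays in place of IR vectors):

#include <cassert>
#include <vector>

// Round trip of the two mask families used above: the interleaved store mask
// writes lane j*VF+i of the concatenated vector to position i*NumVec+j, and a
// strided mask <i, i+NumVec, ...> reads member i back out of the wide vector.
int main() {
  const unsigned VF = 4, NumVec = 3;
  std::vector<int> R = {10, 11, 12, 13}, G = {20, 21, 22, 23},
                   B = {30, 31, 32, 33};

  // Concatenate the members, then apply the interleaved store mask.
  std::vector<int> Wide;
  Wide.insert(Wide.end(), R.begin(), R.end());
  Wide.insert(Wide.end(), G.begin(), G.end());
  Wide.insert(Wide.end(), B.begin(), B.end());
  std::vector<int> Interleaved(VF * NumVec);
  unsigned Pos = 0;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVec; j++)
      Interleaved[Pos++] = Wide[j * VF + i];
  // Memory layout is now R0 G0 B0 R1 G1 B1 ...
  assert(Interleaved[0] == 10 && Interleaved[1] == 20 && Interleaved[2] == 30);

  // A strided mask recovers each member from the (re)loaded wide vector.
  std::vector<int> G2;
  for (unsigned i = 0; i < VF; i++)
    G2.push_back(Interleaved[1 + i * NumVec]);
  assert(G2 == G);
  return 0;
}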
+ Value *IVec = Builder.CreateShuffleVector( + WideVec, UndefValue::get(WideVecTy), IMask, "interleaved.vec"); + Instruction *WideStore = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Align, false); + propagateMetadata(WideStore, Instr); + } + return; +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1684,6 +1994,15 @@ if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); + if (Legal->belongsToInterleavedAccess(Instr)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(Instr); + // Skip the access if it is not the insert position. + if (!IA->isInsertPos(Instr)) + return; + return vectorizeInterleavedAccess(Instr, Ptr, DataTy); + } + // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -1780,7 +2099,7 @@ if (Legal->isMaskRequired(SI)) NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, Mask[Part]); - else + else NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); propagateMetadata(NewSI, SI); } @@ -3414,6 +3733,344 @@ return true; } +static bool isInBoundsGep(Value *Ptr) { + if (GetElementPtrInst *GEP = dyn_cast(Ptr)) + return GEP->isInBounds(); + return false; +} + +/// \brief Check whether the access through \p Ptr has a constant stride. +static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const SCEV *&PtrScev) { + const Type *Ty = Ptr->getType(); + assert(Ty->isPointerTy() && "Unexpected non-ptr"); + const PointerType *PtrTy = cast(Ty); + + PtrScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(PtrScev); + // The pointer must be an AddRecExpr pointer and the accesss function must + // stride over the innermost loop. + if (!AR || Lp != AR->getLoop()) + return 0; + + // The address calculation must not wrap. Otherwise, a dependence could be + // inverted. + // An inbounds getelementptr that is a AddRec with a unit stride + // cannot wrap per definition. The unit stride requirement is checked later. + // An getelementptr without an inbounds attribute and unit stride would have + // to access the pointer value "0" which is undefined behavior in address + // space 0, therefore we can also vectorize this case. + bool IsInBoundsGEP = isInBoundsGep(Ptr); + bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); + bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; + if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) + return 0; + + // Check the step is constant. + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast(Step); + if (!C) + return 0; + + auto &DL = Lp->getHeader()->getModule()->getDataLayout(); + int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. + if (APStepVal.getBitWidth() > 64) + return 0; + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. + int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) + return 0; + + // If the SCEV could wrap but we have an inbounds gep with a unit stride we + // know we can't "wrap around the address space". In case of address space + // zero we know that this won't happen without triggering undefined behavior. 
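The stride computed above is simply the constant step of the pointer recurrence divided by the element size, and a step that is not a whole multiple of the element size is rejected. A standalone sketch with concrete numbers (hypothetical helper name):

#include <cassert>
#include <cstdint>

// Stride from the pointer recurrence: step in bytes over the element size.
// A non-multiple step means the access is not strided (returns 0).
static int64_t strideFromStep(int64_t StepBytes, int64_t ElementSize) {
  if (ElementSize == 0 || StepBytes % ElementSize != 0)
    return 0;
  return StepBytes / ElementSize;
}

int main() {
  // i32 elements advanced by 12 bytes each iteration: stride 3 (e.g. RGB).
  assert(strideFromStep(12, 4) == 3);
  // A reverse i32 access stepping by -4 bytes: stride -1.
  assert(strideFromStep(-4, 4) == -1);
  // A 6-byte step over i32 elements is not a whole stride.
  assert(strideFromStep(6, 4) == 0);
  return 0;
}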
+ if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && + Stride != 1 && Stride != -1) + return 0; + + return Stride; +} + +static unsigned getAddressSpaceOperand(Instruction *I) { + if (LoadInst *L = dyn_cast(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast(I)) + return S->getPointerAddressSpace(); + return -1; +} + +static Value *getPointerOperand(Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return nullptr; +} + +static Value *getBasePointer(Instruction *I, const DataLayout &DL) { + Value *Ptr = getPointerOperand(I); + assert(Ptr && "Fail to get point from a memory instruction"); + return GetUnderlyingObject(Ptr, DL); +} + +static bool isConsecutiveAccess(Instruction *A, Instruction *B, + ScalarEvolution *SE, const DataLayout &DL) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); + Type *Ty = cast(PtrA->getType())->getElementType(); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); + + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + + APInt OffsetDelta = OffsetB - OffsetA; + + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; + + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; + + // Otherwise compute the distance with SCEV between the base pointers. + const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(BaseDelta); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} + +void LoopVectorizationLegality::collectConstStridedAccess( + MapVector &StridedAccesses) { + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Value *Ptr = getPointerOperand(it); + if (!Ptr) + continue; + + const SCEV *PtrScev; + int Stride = isStridedPtr(SE, Ptr, TheLoop, PtrScev); + if (Stride < 2 && Stride > -2) + continue; + + DEBUG(dbgs() << "LV: Found a constant strided access\n"); + DEBUG(dbgs() << " pointer:" << *Ptr << "\n"); + DEBUG(dbgs() << " Stride:" << Stride << "\n"); + StridedAccesses[it] = Stride; + } + } +} + +void LoopVectorizationLegality::collectInterleavedAccess( + SmallVector &Queue, int Stride, bool IsLoad) { + if (Queue.size() < 2) { + Queue.clear(); + return; + } + + SetVector Heads; + SetVector Tails; + SmallDenseMap ConsecutiveChain; + + // Collect all the consecutive pairs. 
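For two accesses off the same stripped base pointer, the consecutiveness check above reduces to the offset delta equalling one element size. A minimal sketch of that fast path (hypothetical helper, the SCEV-based fallback is not modelled):

#include <cassert>
#include <cstdint>

// Accesses sharing a base pointer are consecutive when the second one starts
// exactly one element after the first.
static bool isConsecutiveSameBase(int64_t OffsetA, int64_t OffsetB,
                                  int64_t ElementSize) {
  return OffsetB - OffsetA == ElementSize;
}

int main() {
  // A[i] at byte offset 0 and A[i+1] at byte offset 4 for i32 elements.
  assert(isConsecutiveSameBase(0, 4, 4));
  // A[i] and A[i+2] are not consecutive.
  assert(!isConsecutiveSameBase(0, 8, 4));
  return 0;
}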
+ const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (unsigned i = 0, e = Queue.size(); i != e; i++) { + for (unsigned j = 0; j != e; j++) { + if (i == j) + continue; + if (isConsecutiveAccess(Queue[i], Queue[j], SE, DL)) { + Heads.insert(Queue[i]); + Tails.insert(Queue[j]); + ConsecutiveChain[Queue[i]] = Queue[j]; + } + } + } + + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + SmallSetVector ConsecutiveList; + Instruction *I = *it; + + // Collect a list with consecutive accesses. + while (Tails.count(I) || Heads.count(I)) { + ConsecutiveList.insert(I); + // Move to the next access in the chain. + I = ConsecutiveChain[I]; + } + + unsigned Size = ConsecutiveList.size(); + unsigned Num = std::abs(Stride); + if (Size < Num) + continue; + + // As every interleave access consists of the same number of members as the + // stride. There may be more than 1 interleaved access in this list. + for (unsigned Part = 0; Part < Size / Num; Part++) { + InterleavedAccess *ILAccess = new InterleavedAccess(Stride, IsLoad); + SmallSetVector &Members = ILAccess->Members; + unsigned MaxAlign = 1; + for (unsigned i = Part * Num; i < Part * Num + Num; i++) { + Instruction *Access = ConsecutiveList[i]; + // Insert each member. + Members.insert(Access); + // Record the relationship of the member and the interleaved access + InterleavedAccessMap[Access] = ILAccess; + + unsigned Align = IsLoad ? dyn_cast(Access)->getAlignment() + : dyn_cast(Access)->getAlignment(); + if (Align > MaxAlign) + MaxAlign = Align; + } + // The alignment of the interleaved access is the max member alignment. + ILAccess->Align = MaxAlign; + + // To avoid breaking the dependences, the new vectorized interleaved + // access has different insert positions for load and store. + if (IsLoad) { + for (unsigned i = 0, e = Queue.size(); i != e; i++) { + // Start from the first access in the queue. If an access is in the + // current interleaved access, it is the first access and set it to be + // the insert position. + if (Members.count(Queue[i])) { + ILAccess->InsertPos = Queue[i]; + break; + } + } + } else { + for (int i = Queue.size() - 1, e = -1; i != e; i--) { + // Start from the last access in the queue. If an access is in the + // current interleaved access, it is the last access and set it to be + // the insert position. + if (Members.count(Queue[i])) { + ILAccess->InsertPos = Queue[i]; + break; + } + } + } + + DEBUG(dbgs() << "LV: Found interleaved accesses with Stride " << Stride + << "\n"); + } + } + + return; +} + +void LoopVectorizationLegality::analyzeInterleavedAccess() { + MapVector StridedAccesses; + collectConstStridedAccess(StridedAccesses); + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + + // The following logic groups the constant strided accesses and find the + // interleaved accesses in two steps: + // (1) Groups loads/stores with the same stride and base pointers into a + // queue. + // (2) Calls another function to collect all interleaved accesses in that + // queue. + // TODO: It can't collect interleaved access if the loads/stores are mixed. + // E.g. for (i = 0; i < N; i+=2) { + // A[i] += 1; + // A[i+1] += 2; + // } + // It will fail to identify the interleaved accesses as the mixed order: + // load A[i], store A[i], load A[i+1], store A[i+1]. 
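The scan in analyzeInterleavedAccess keeps one queue per base pointer, stride and load/store kind, and flushes it whenever an access does not match, which is why the mixed order in the TODO above defeats grouping. A standalone sketch of that scan (simplified record type, not the real IR classes):

#include <cassert>
#include <string>
#include <vector>

struct MemAccess {
  std::string Base;
  int Stride;  // 0 means "not a constant strided access"
  bool IsLoad;
};

// Extend the current queue while base pointer, stride and access kind all
// match; any other memory access, or a mismatch, ends the queue. Returns the
// sizes of the queues that were formed.
static std::vector<unsigned> scanQueues(const std::vector<MemAccess> &Block) {
  std::vector<unsigned> QueueSizes;
  unsigned Cur = 0;
  const MemAccess *Key = nullptr;
  auto Flush = [&] {
    if (Cur)
      QueueSizes.push_back(Cur);
    Cur = 0;
    Key = nullptr;
  };
  for (const MemAccess &A : Block) {
    if (A.Stride == 0) {  // a non-strided memory access merges the queue
      Flush();
      continue;
    }
    if (Key && (Key->Base != A.Base || Key->Stride != A.Stride ||
                Key->IsLoad != A.IsLoad))
      Flush();
    if (!Key)
      Key = &A;
    ++Cur;
  }
  Flush();
  return QueueSizes;
}

int main() {
  // Mixed order: load A[i], store A[i], load A[i+1], store A[i+1]. Every kind
  // switch flushes, so no queue reaches the two members needed for a group.
  std::vector<MemAccess> Mixed = {{"A", 2, true}, {"A", 2, false},
                                  {"A", 2, true}, {"A", 2, false}};
  for (unsigned S : scanQueues(Mixed))
    assert(S < 2);

  // Grouped loads followed by grouped stores do queue up.
  std::vector<MemAccess> Grouped = {{"A", 2, true}, {"A", 2, true},
                                    {"A", 2, false}, {"A", 2, false}};
  assert(scanQueues(Grouped).size() == 2 && scanQueues(Grouped)[0] == 2);
  return 0;
}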
+ for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + // Queue for the loads/stores of the same stride and base pointer. + SmallVector AccessQueue; + bool TryMerge = false; + Value *CurBase = nullptr; + int CurStride; + bool IsLoad = false; + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + if (TryMerge) { + collectInterleavedAccess(AccessQueue, CurStride, IsLoad); + CurBase = nullptr; + TryMerge = false; + } + + if (!it->mayReadOrWriteMemory()) + continue; + + // A non-strided memory access may break the dependence. Merge the queue. + if (!StridedAccesses.count(it)) { + TryMerge = true; + continue; + } + + Value *Base = getBasePointer(it, DL); + int Stride = StridedAccesses[it]; + bool Read = it->mayReadFromMemory(); + + // Start a new queue + if (CurBase == nullptr) { + AccessQueue.clear(); + AccessQueue.push_back(it); + CurBase = Base; + CurStride = Stride; + IsLoad = Read; + continue; + } + + // This instructure doesn't fit the current queue. Try to merge current + // queue and insert it again. + // TODO: this may missing opportunities as it hasn't collected all + // loads/stores with the same base pointer and stride. + if (CurBase != Base || CurStride != Stride || IsLoad != Read) { + TryMerge = true; + --it; + continue; + } + + AccessQueue.push_back(it); + } + + // Collect the tail accesses. + if (CurBase != nullptr) { + collectInterleavedAccess(AccessQueue, CurStride, IsLoad); + CurBase = nullptr; + TryMerge = false; + } + } +} + static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -3601,12 +4258,12 @@ return false; } if (EnableMemAccessVersioning) - collectStridedAccess(ST); + collectInvarStridedAccess(ST); } if (EnableMemAccessVersioning) if (LoadInst *LI = dyn_cast(it)) - collectStridedAccess(LI); + collectInvarStridedAccess(LI); // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. @@ -3629,6 +4286,9 @@ } } + if (VectorizeInterleavedAccess) + analyzeInterleavedAccess(); + return true; } @@ -3744,7 +4404,7 @@ return Stride; } -void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { +void LoopVectorizationLegality::collectInvarStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) Ptr = LI->getPointerOperand(); @@ -3757,7 +4417,7 @@ if (!Stride) return; - DEBUG(dbgs() << "LV: Found a strided access that we can version"); + DEBUG(dbgs() << "LV: Found an invariable strided access that we can version"); DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n"); Strides[Ptr] = Stride; StrideSet.insert(Stride); @@ -4575,6 +5235,14 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved memory loads/stores + if (Legal->belongsToInterleavedAccess(I)) { + LoopVectorizationLegality::InterleavedAccess *IA = + Legal->getInterleavedAccess(I); + return TTI.getInterleavedMemoryOpCost(I->getOpcode(), VectorTy, + IA->getNumMembers(), IA->Align, AS); + } + // Scalarized loads/stores. 
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/CodeGen/AArch64/shuffle-access-opt.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/shuffle-access-opt.ll @@ -0,0 +1,175 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; CHECK-LABEL: test_load_stride2_1Mask: +; CHECK: ld2 { v0.4s, v1.4s }, [x0] +define <4 x i32> @test_load_stride2_1Mask(i32* %ptr) { + %base = bitcast i32* %ptr to <8 x i32>* + %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4 + %interleaved.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <8 x i32> + %strided.v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v0, %strided.v1 + ret <4 x i32> %add +} + +; CHECK-LABEL: test_load_stride2_2Masks: +; CHECK: ld2 { v0.4s, v1.4s }, [x0] +define <4 x i32> @test_load_stride2_2Masks(i32* %ptr) { + %base = bitcast i32* %ptr to <8 x i32>* + %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4 + %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v0, %strided.v1 + ret <4 x i32> %add +} + +; CHECK-LABEL: test_load_stride3_1Mask: +; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +define <4 x i32> @test_load_stride3_1Mask(i32* %ptr) { + %base = bitcast i32* %ptr to <12 x i32>* + %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 + %interleaved.vec = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <12 x i32> + %strided.v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> + %strided.v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v1, %strided.v2 + %add1 = add nsw <4 x i32> %add, %strided.v0 + ret <4 x i32> %add1 +} + +; CHECK-LABEL: test_load_stride3_2Masks: +; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0] +define <4 x i32> @test_load_stride3_2Masks(i32* %ptr) { + %base = bitcast i32* %ptr to <12 x i32>* + %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4 + %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v2, %strided.v1 + ret <4 x i32> %add +} + +; CHECK-LABEL: test_load_stride4_1Mask: +; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +define <4 x i32> @test_load_stride4_1Mask(i32* %ptr) { + %base = bitcast i32* %ptr to <16 x i32>* + %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 + %interleaved.vec = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <16 x i32> + %strided.v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> + %strided.v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> + %strided.v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> + %strided.v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v0, %strided.v3 + %add1 = add nsw <4 x i32> %strided.v1, %strided.v2 + %add2 = add nsw <4 x i32> %add, %add1 + ret <4 x i32> %add2 +} + +; CHECK-LABEL: test_load_stride4_3Masks: +; 
CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +define <4 x i32> @test_load_stride4_3Masks(i32* %ptr) { + %base = bitcast i32* %ptr to <16 x i32>* + %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4 + %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> + %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> + %add = add nsw <4 x i32> %strided.v0, %strided.v2 + ret <4 x i32> %add +} + +; CHECK-LABEL: test_store_stride2: +; CHECK: st2 { v0.4s, v1.4s }, [x0] +define void @test_store_stride2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) { + %base = bitcast i32* %ptr to <8 x i32>* + %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4 + ret void +} + +; CHECK-LABEL: test_store_stride3: +; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0] +define void @test_store_stride3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { + %base = bitcast i32* %ptr to <12 x i32>* + %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> + store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4 + ret void +} + +; CHECK-LABEL: test_store_stride4: +; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0] +define void @test_store_stride4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { + %base = bitcast i32* %ptr to <16 x i32>* + %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> + store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4 + ret void +} + +; The following cases test that vectors of pointer elements can be well translated. 
+; CHECK-LABEL: test_load_stride2_ptrvec: +; CHECK: ld2 { v0.2d, v1.2d }, [x0] +define void @test_load_stride2_ptrvec(i32** %ptr, <4 x i32*>* %ptr1) { + %base = bitcast i32** %ptr to <4 x i32*>* + %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4 + %interleaved.vec = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <4 x i32> + store <4 x i32*> %interleaved.vec, <4 x i32*>* %ptr1 + ret void +} + +; CHECK-LABEL: test_load_stride3_ptrvec: +; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0] +define void @test_load_stride3_ptrvec(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) { + %base = bitcast i32** %ptr to <6 x i32*>* + %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4 + %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> + store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1 + %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> + store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2 + ret void +} + +; CHECK-LABEL: test_load_stride4_ptrvec: +; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +define void @test_load_stride4_ptrvec(i32** %ptr, <8 x i32*>* %ptr1) { + %base = bitcast i32** %ptr to <8 x i32*>* + %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4 + %interleaved.vec = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <8 x i32> + store <8 x i32*> %interleaved.vec, <8 x i32*>* %ptr1 + ret void +} + +; CHECK-LABEL: test_store_stride2_ptrvec: +; CHECK: st2 { v0.2d, v1.2d }, [x0] +define void @test_store_stride2_ptrvec(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) { + %base = bitcast i32** %ptr to <4 x i32*>* + %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> + store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4 + ret void +} + +; CHECK-LABEL: test_store_stride3_ptrvec: +; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0] +define void @test_store_stride3_ptrvec(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) { + %base = bitcast i32** %ptr to <6 x i32*>* + %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> + %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> + %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> + store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4 + ret void +} + +; CHECK-LABEL: test_store_stride4_ptrvec: +; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] +define void @test_store_stride4_ptrvec(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) { + %base = bitcast i32* %ptr to <8 x i32*>* + %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> + %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> + %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> + store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4 + ret void +} + Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,5 +1,5 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 2>&1 | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=VF2 target datalayout 
= "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" @@ -20,12 +20,12 @@ ; CHECK: %index.next = add i64 %index, 8 ; CHECK: icmp eq i64 %index.next, 512 -; FORCE-VEC-LABEL: @ind_plus2( -; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>* -; FORCE-VEC: mul nsw <2 x i32> -; FORCE-VEC: add nsw <2 x i32> -; FORCE-VEC: %index.next = add i64 %index, 2 -; FORCE-VEC: icmp eq i64 %index.next, 512 +; VF2-LABEL: @ind_plus2( +; VF2: %wide.load = load <2 x i32>, <2 x i32>* +; VF2: mul nsw <2 x i32> +; VF2: add nsw <2 x i32> +; VF2: %index.next = add i64 %index, 2 +; VF2: icmp eq i64 %index.next, 512 define i32 @ind_plus2(i32* %A) { entry: br label %for.body @@ -64,12 +64,12 @@ ; CHECK: %index.next = add i64 %index, 8 ; CHECK: icmp eq i64 %index.next, 512 -; FORCE-VEC-LABEL: @ind_minus2( -; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>* -; FORCE-VEC: mul nsw <2 x i32> -; FORCE-VEC: add nsw <2 x i32> -; FORCE-VEC: %index.next = add i64 %index, 2 -; FORCE-VEC: icmp eq i64 %index.next, 512 +; VF2-LABEL: @ind_minus2( +; VF2: %wide.load = load <2 x i32>, <2 x i32>* +; VF2: mul nsw <2 x i32> +; VF2: add nsw <2 x i32> +; VF2: %index.next = add i64 %index, 2 +; VF2: icmp eq i64 %index.next, 512 define i32 @ind_minus2(i32* %A) { entry: br label %for.body @@ -102,30 +102,27 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 - -; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: mul nsw <2 x i32> -; FORCE-VEC: add nsw <2 x i32> -; FORCE-VEC: %index.next = add i64 %index, 2 -; FORCE-VEC: icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 + +; VF2-LABEL: @ptr_ind_plus2( +; VF2: %[[V:.*]] = load <4 x i32> +; VF2: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; VF2: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; VF2: mul nsw <2 x i32> +; VF2: add nsw <2 x i32> +; VF2: %index.next = add i64 %index, 2 +; VF2: icmp eq i64 %index.next, 1024 define i32 @ptr_ind_plus2(i32* %A) { entry: br label %for.body Index: test/Transforms/LoopVectorize/AArch64/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/interleaved-access.ll @@ -0,0 +1,398 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -runtime-memory-check-threshold=24 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 -runtime-memory-check-threshold=24 < %s | FileCheck %s --check-prefix=UF2 + +target datalayout = 
"e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; +; int test_struct_load2(struct ST2 *A, int n) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. 
+; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +; UF2-LABEL: @test_array_load2_store2( +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: load <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> +; UF2: shufflevector <4 x i32> +; UF2: store <8 x i32> + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +; UF2-LABEL: @test_struct_load3( +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: load <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> +; UF2: shufflevector <12 x i32> + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. 
+; int A[3072];
+; struct ST3 S[1024];
+; void test_struct_array_load3_store3() {
+;   int *ptr = A;
+;   for (int i = 0; i < 1024; i++) {
+;     int X1 = *ptr++;
+;     int X2 = *ptr++;
+;     int X3 = *ptr++;
+;     S[i].x = X1 + 1;
+;     S[i].y = X2 + 2;
+;     S[i].z = X3 + 3;
+;   }
+; }
+
+; CHECK-LABEL: @test_struct_array_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
+
+; UF2-LABEL: @test_struct_array_load3_store3(
+; UF2: load <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: load <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <12 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <8 x i32>
+; UF2: store <12 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <8 x i32>
+; UF2: store <12 x i32>
+
+@A = common global [3072 x i32] zeroinitializer, align 4
+@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
+
+define void @test_struct_array_load3_store3() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
+  %0 = load i32, i32* %ptr.016, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
+  %2 = load i32, i32* %incdec.ptr1, align 4
+  %add = add nsw i32 %0, 1
+  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %add3 = add nsw i32 %1, 2
+  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
+  store i32 %add3, i32* %y, align 4
+  %add6 = add nsw i32 %2, 3
+  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
+  store i32 %add6, i32* %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Make sure that the following case can be vectorized.
+; struct ST4 {
+;   int x;
+;   int y;
+;   int z;
+;   int w;
+; };
+; int test_struct_load4(struct ST4 *S) {
+;   int r = 0;
+;   for (int i = 0; i < 1024; i++) {
+;     r += S[i].x;
+;     r -= S[i].y;
+;     r += S[i].z;
+;     r -= S[i].w;
+;   }
+;   return r;
+; }
+
+; CHECK-LABEL: @test_struct_load4(
+; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+
+; UF2-LABEL: @test_struct_load4(
+; UF2: load <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: load <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+; UF2: shufflevector <16 x i32>
+
+%struct.ST4 = type { i32, i32, i32, i32 }
+
+define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
+  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
+  %0 = load i32, i32* %x, align 4
+  %add = add nsw i32 %0, %r.022
+  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
+  %1 = load i32, i32* %y, align 4
+  %sub = sub i32 %add, %1
+  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
+  %2 = load i32, i32* %z, align 4
+  %add5 = add nsw i32 %sub, %2
+  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
+  %3 = load i32, i32* %w, align 4
+  %sub8 = sub i32 %add5, %3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 %sub8
+}
+
+
+; int B[1024];
+; struct ST4 T[1024];
+; void test_struct_store4() {
+;   int *ptr = B;
+;   for (int i = 0; i < 1024; i++) {
+;     int X = *ptr++;
+;     T[i].x = X + 1;
+;     T[i].y = X * 2;
+;     T[i].z = X + 3;
+;     T[i].w = X + 4;
+;   }
+; }
+
+; CHECK-LABEL: @test_struct_store4(
+; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4
+
+; UF2-LABEL: @test_struct_store4(
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <8 x i32>
+; UF2: store <16 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <4 x i32>
+; UF2: shufflevector <8 x i32>
+; UF2: store <16 x i32>
+
+@B = common global [1024 x i32] zeroinitializer, align 4
+@T = common global [1024 x %struct.ST4] zeroinitializer, align 4
+
+define void @test_struct_store4() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1
+  %0 = load i32, i32* %ptr.017, align 4
+  %add = add nsw i32 %0, 1
+  %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %mul = shl nsw i32 %0, 1
+  %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1
+  store i32 %mul, i32* %y, align 4
+  %sub = add nsw i32 %0, 3
+  %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2
+  store i32 %sub, i32* %z, align 4
+  %add5 = add nsw i32 %0, 4
+  %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3
+  store i32 %add5, i32* %w, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
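For reference, the stride-2 pattern exercised by @test_array_load2_store2 above is the same access shape a programmer would otherwise write by hand with NEON structured load/store intrinsics. The sketch below is illustrative only; the wrapper function name and the use of <arm_neon.h> are ours, not part of the patch. It shows the ld2/st2 form that the new interleaved-access cost model and ldN/stN matching are aiming to recover automatically.

#include <arm_neon.h>

/* Illustrative sketch: hand-written NEON equivalent of the
   test_array_load2_store2 loop (CD[i] = A + C; CD[i+1] = B * D). */
void test_array_load2_store2_neon(int32_t *AB, int32_t *CD, int32_t C, int32_t D) {
  int32x4_t vC = vdupq_n_s32(C);
  int32x4_t vD = vdupq_n_s32(D);
  for (int i = 0; i < 1024; i += 8) {
    int32x4x2_t ab = vld2q_s32(&AB[i]);    /* ld2: deinterleave A and B lanes */
    int32x4x2_t cd;
    cd.val[0] = vaddq_s32(ab.val[0], vC);  /* even lanes: A + C */
    cd.val[1] = vmulq_s32(ab.val[1], vD);  /* odd lanes:  B * D */
    vst2q_s32(&CD[i], cd);                 /* st2: re-interleave and store */
  }
}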