Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -300,6 +300,10 @@ bool isLegalMaskedStore(Type *DataType, int Consecutive) const; bool isLegalMaskedLoad(Type *DataType, int Consecutive) const; + bool supportInterleaveAccess() const; + bool isLegalInterleaveType(Type *VecType) const; + bool isLegalInterleaveNum(unsigned NumVec) const; + /// \brief Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store /// of the specified type. @@ -432,6 +436,11 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of interleaved Load and Store instructions. + unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. /// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -529,6 +538,9 @@ int64_t Scale) = 0; virtual bool isLegalMaskedStore(Type *DataType, int Consecutive) = 0; virtual bool isLegalMaskedLoad(Type *DataType, int Consecutive) = 0; + virtual bool supportInterleaveAccess() = 0; + virtual bool isLegalInterleaveType(Type *VecType) = 0; + virtual bool isLegalInterleaveNum(unsigned NumVec) = 0; virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) = 0; @@ -569,6 +581,9 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -642,6 +657,15 @@ bool isLegalMaskedLoad(Type *DataType, int Consecutive) override { return Impl.isLegalMaskedLoad(DataType, Consecutive); } + bool supportInterleaveAccess() override { + return Impl.supportInterleaveAccess(); + } + bool isLegalInterleaveType(Type *VecType) override { + return Impl.isLegalInterleaveType(VecType); + } + bool isLegalInterleaveNum(unsigned NumVec) override { + return Impl.isLegalInterleaveNum(NumVec); + } int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) override { return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale); @@ -724,6 +748,11 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleaveMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -215,6 +215,12 @@ bool isLegalMaskedLoad(Type *DataType, int Consecutive) { return false; } + bool supportInterleaveAccess() { return false; } + + bool isLegalInterleaveType(Type *VecType) { return 
false; } + + bool isLegalInterleaveNum(unsigned NumVec) { return false; } + int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // Guess that all legal addressing mode are free. @@ -298,6 +304,12 @@ return 1; } + unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/IR/IRBuilder.h =================================================================== --- include/llvm/IR/IRBuilder.h +++ include/llvm/IR/IRBuilder.h @@ -435,6 +435,12 @@ CallInst *CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align, Value *Mask); + CallInst *CreateIndexLoad(Type *RetValTy, Value *Ptr, Constant *Indices, + unsigned Align, const Twine &Name = ""); + + CallInst *CreateIndexStore(Value *VecVal, Value *Ptr, Constant *Indices, + unsigned Align); + /// \brief Create an assume intrinsic call that allows the optimizer to /// assume that the provided condition will be true. CallInst *CreateAssumption(Value *Cond); Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -612,6 +612,21 @@ LLVMVectorSameWidth<0, llvm_i1_ty>], [IntrReadWriteArgMem]>; +//===--------------------- Index load/store Intrinsics --------------------===// +// +// The index load +def int_index_load : Intrinsic<[llvm_anyvector_ty], + [llvm_ptr_ty, + LLVMVectorSameWidth<0, llvm_i32_ty>, + llvm_i32_ty], + [IntrReadArgMem]>; +// The index store +def int_index_store : Intrinsic<[], + [llvm_anyvector_ty, llvm_ptr_ty, + LLVMVectorSameWidth<0, llvm_i32_ty>, + llvm_i32_ty], + [IntrReadWriteArgMem]>; + // Intrinsics to support bit sets. def int_bitset_test : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty], [IntrNoMem]>; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -514,6 +514,7 @@ if (Lp != AR->getLoop()) { DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " << *Ptr << " SCEV: " << *PtrScev << "\n"); + return 0; } // The address calculation must not wrap. Otherwise, a dependence could be @@ -761,21 +762,29 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + unsigned NumIterations = ForcedFactor * ForcedUnroll; + // The number of vectorized & interleaved iterations should not be less than 2 + if (NumIterations < 2) + NumIterations = 2; + + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when either of: + // (1) Distance < Stride * TypeByteSize + // (2) Distance >= TypeByteSize * NumIterations * Stride + // is true and + // (3) Distance <= MaxSafeDepDistBytes + // is true. 
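+  // For example, with TypeByteSize = 4, Stride = 2 and NumIterations = 4, a
+  // distance of 4 bytes satisfies (1) and a distance of 32 bytes satisfies
+  // (2), while a distance of 12 bytes is rejected by both and the dependence
+  // is treated as unsafe.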
+ if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIterations * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + if (Distance > Stride * TypeByteSize) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -111,6 +111,18 @@ return TTIImpl->isLegalMaskedLoad(DataType, Consecutive); } +bool TargetTransformInfo::supportInterleaveAccess() const { + return TTIImpl->supportInterleaveAccess(); +} + +bool TargetTransformInfo::isLegalInterleaveType(Type *VecType) const { + return TTIImpl->isLegalInterleaveType(VecType); +} + +bool TargetTransformInfo::isLegalInterleaveNum(unsigned NumVec) const { + return TTIImpl->isLegalInterleaveNum(NumVec); +} + int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, @@ -232,6 +244,14 @@ } unsigned +TargetTransformInfo::getInterleaveMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + return TTIImpl->getInterleaveMemoryOpCost(Opcode, Src, Alignment, + AddressSpace); +} + +unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { return TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys); Index: lib/IR/IRBuilder.cpp =================================================================== --- lib/IR/IRBuilder.cpp +++ lib/IR/IRBuilder.cpp @@ -230,6 +230,31 @@ return createCallHelper(TheFn, Ops, this, Name); } +CallInst *IRBuilderBase::CreateIndexLoad(Type *RetValTy, Value *Ptr, + Constant *Indices, unsigned Align, + const Twine &Name) { + assert(isa(RetValTy) && "expected a vector value"); + assert(isa(Ptr->getType()) && "expected a pointer value"); + + Value *Ops[] = {Ptr, Indices, getInt32(Align)}; + Module *M = BB->getParent()->getParent(); + Intrinsic::ID Id = Intrinsic::index_load; + Value *TheFn = Intrinsic::getDeclaration(M, Id, RetValTy); + return createCallHelper(TheFn, Ops, this, Name); +} + +CallInst *IRBuilderBase::CreateIndexStore(Value *VecVal, Value *Ptr, + Constant *Indices, unsigned Align) { + assert(isa(VecVal->getType()) && "expected a vector value"); + assert(isa(Ptr->getType()) && "expected a pointer value"); + + Value *Ops[] = {VecVal, Ptr, Indices, getInt32(Align)}; + Module *M = BB->getParent()->getParent(); + Intrinsic::ID Id = Intrinsic::index_store; + Value *TheFn = Intrinsic::getDeclaration(M, Id, VecVal->getType()); + return createCallHelper(TheFn, Ops, this); +} + CallInst *IRBuilderBase::CreateGCStatepoint(Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, Index: lib/Target/AArch64/AArch64.h =================================================================== --- lib/Target/AArch64/AArch64.h +++ lib/Target/AArch64/AArch64.h @@ -38,6 +38,7 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); +FunctionPass *createAArch64InterleaveAccessPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); Index: 
lib/Target/AArch64/AArch64InterleaveAccess.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64InterleaveAccess.cpp
@@ -0,0 +1,286 @@
+//=---------------------- AArch64InterleaveAccess.cpp ---------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64InterleaveAccess pass, which matches
+// interleave access intrinsics to the AArch64 ldN/stN intrinsics.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-interleave-access"
+
+namespace llvm {
+void initializeAArch64InterleaveAccessPass(PassRegistry &);
+}
+
+namespace {
+class AArch64InterleaveAccess : public FunctionPass {
+
+public:
+  static char ID;
+  AArch64InterleaveAccess() : FunctionPass(ID) {
+    initializeAArch64InterleaveAccessPass(*PassRegistry::getPassRegistry());
+  }
+
+  const char *getPassName() const override {
+    return "AArch64 Interleave Access";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  /// Iterate over the basic blocks of the function and match index
+  /// load/store intrinsics with interleaved indices to the AArch64 ldN/stN
+  /// intrinsics.
+  bool runOnFunction(Function &F) override;
+
+private:
+  TargetTransformInfo *TTI;
+  const DataLayout *DL;
+  Module *M;
+
+  bool castInterleaveIntrinsics(Function::iterator B);
+  bool isInterleaveIndices(Value *IdxOp, unsigned &NumVec, unsigned &NumElts);
+};
+} // end anonymous namespace.
+
+char AArch64InterleaveAccess::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64InterleaveAccess, "aarch64-interleave-access",
+                      "AArch64 Interleave Access Pass", false, false)
+INITIALIZE_PASS_END(AArch64InterleaveAccess, "aarch64-interleave-access",
+                    "AArch64 Interleave Access Pass", false, false)
+
+FunctionPass *llvm::createAArch64InterleaveAccessPass() {
+  return new AArch64InterleaveAccess();
+}
+
+static unsigned getIntrinsicID(unsigned Num, bool IsLoad) {
+  static unsigned LoadInt[3] = {Intrinsic::aarch64_neon_ld2,
+                                Intrinsic::aarch64_neon_ld3,
+                                Intrinsic::aarch64_neon_ld4};
+  static unsigned StoreInt[3] = {Intrinsic::aarch64_neon_st2,
+                                 Intrinsic::aarch64_neon_st3,
+                                 Intrinsic::aarch64_neon_st4};
+
+  if (IsLoad)
+    return LoadInt[Num - 2];
+  else
+    return StoreInt[Num - 2];
+}
+
+// Return true if the given indices list is interleaved; also return the
+// number of vectors and the number of vector elements.
+// E.g. <0, 2, 4, 6, 1, 3, 5, 7>. NumVec is 2 and NumElts is 4.
+//      <0, 3, ..., 9, 1, 4, ..., 10, 2, 5, ..., 11>. NumVec is 3 and NumElts
+//      is 4.
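+// An indices vector such as <0, 1, 2, 3>, in contrast, is rejected: its
+// second element implies an interleave stride of 1, which is not a legal
+// interleave number.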
+bool AArch64InterleaveAccess::isInterleaveIndices(Value *IdxOp, + unsigned &NumVec, + unsigned &NumElts) { + const Constant *CIdx = dyn_cast(IdxOp); + assert(CIdx && (isa(CIdx) || isa(CIdx)) && + "expect indices to be constant vector"); + + unsigned NumOp = CIdx->getType()->getVectorNumElements(); + if (NumOp <= 2) + return false; + + Constant *COp0 = CIdx->getAggregateElement(0U); + Constant *COp1 = CIdx->getAggregateElement(1); + auto *CI0 = dyn_cast(COp0); + auto *CI1 = dyn_cast(COp1); + if (!CI0 || !CI1 || !CI0->isZero()) + return false; + + // Interleave Stride. + NumVec = CI1->getZExtValue(); + if (!TTI->isLegalInterleaveNum(NumVec)) + return false; + + NumElts = NumOp / NumVec; + // The indices should match: 0, NumVec, 2*NumVec, ..., 1, NumVec + 1, ... + for (unsigned i = 0; i < NumVec; i++) { + for (unsigned j = 0; j < NumElts; j++) { + Constant *COp = CIdx->getAggregateElement(j + i * NumElts); + auto *CI = dyn_cast(COp); + if (!CI || CI->getZExtValue() != j * NumVec + i) + return false; + } + } + + return true; +} + +// Get a indices vector starts with sequential constant and ends with UNDEFs: +// Idx, Idx + 1, Idx + 2, ..., Idx + Num - 1, undef, undef, ..., undef. +static Constant *getSequentialIndices(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Indices; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Indices.push_back(Builder.getInt32(i)); + if (NumUndef == 0) + return ConstantVector::get(Indices); + + Constant *UndefVal = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Indices.push_back(UndefVal); + return ConstantVector::get(Indices); +} + +// Concatenate two vectors with the same element type. The first vector has +// not less elements as the second vector. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialIndices(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 to be the same vector type as V1 with undef elements. + Constant *ExtMask = + getSequentialIndices(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *UndefV = UndefValue::get(VecTy2); + Value *ExtV2 = Builder.CreateShuffleVector(V2, UndefV, ExtMask); + + Constant *Mask = getSequentialIndices(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV2, Mask); +} + +// Concatenate NumVec of vectors in VecList start at Idx +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &VecList, unsigned Idx, + unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return VecList[Idx]; + + unsigned NumPart = PowerOf2Floor(NumVec); + // When the number of vectors is not power of 2, split the list and make sure + // the first list has power of 2 elements. + // E.g. To concate 7 vectors, split into two lists with 4 and 3 elements. 
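+  // If the count is already a power of 2 (e.g. 4 vectors), halve NumPart so
+  // the list is split evenly (2 and 2).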
+ if (NumPart == NumVec) + NumPart /= 2; + Value *V0 = ConcateVectors(Builder, VecList, Idx, NumPart); + unsigned NextIdx = Idx + NumPart; + Value *V1 = ConcateVectors(Builder, VecList, NextIdx, NumVec - NumPart); + return ConcateTwoVectors(Builder, V0, V1); +} + +// Match index load/store intrinsic to the AArch64 ldN/stN intrinsic. +// E.g. %idxvec = call @index.load(i8 *%ptr, <0, 2, 4, 6, 1, 3, 5, 7>) +// To +// %ldN = call {<4 x i32>, <4 x i32>} @llvm.aarch64.ld2(%newptr) +// %V0 = extract {<4 x i32>, <4 x i32>} %ldN, 0 +// %V1 = extract {<4 x i32>, <4 x i32>} %ldN, 1 +// %idxvec = shufflevector %V0, %V1, <8 x i32> <0, 1, 2, 3, 4, 5, 6, 7> +bool AArch64InterleaveAccess::castInterleaveIntrinsics(Function::iterator BB) { + bool Changed = false; + IRBuilder<> Builder(BB); + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE;) { + CallInst *CI = dyn_cast(I++); + if (!CI || !isa(CI)) + continue; + + unsigned Intrinsic = CI->getCalledFunction()->getIntrinsicID(); + if (Intrinsic != Intrinsic::index_load && + Intrinsic != Intrinsic::index_store) + continue; + + bool IsLoad = (Intrinsic == Intrinsic::index_load); + unsigned PtrIdx = IsLoad ? 0 : 1; + Value *IdxOp = CI->getArgOperand(PtrIdx + 1); + unsigned NumVec, NumElts; + if (!isInterleaveIndices(IdxOp, NumVec, NumElts)) + continue; + + Value *Ptr = CI->getArgOperand(PtrIdx); + // Wide vector type of the index.load/index.store + VectorType *WideVecTy; + if (IsLoad) + WideVecTy = dyn_cast(CI->getType()); + else + WideVecTy = dyn_cast(CI->getArgOperand(0)->getType()); + // Type for each vector + VectorType *VecTy = VectorType::get(WideVecTy->getElementType(), NumElts); + if (!TTI->isLegalInterleaveType(VecTy)) + continue; + + Changed = true; + Intrinsic::ID Id = (Intrinsic::ID)(getIntrinsicID(NumVec, IsLoad)); + unsigned AddressSpace = + cast(Ptr->getType())->getPointerAddressSpace(); + Type *Tys[2] = {VecTy, VecTy->getPointerTo(AddressSpace)}; + Value *Callee = Intrinsic::getDeclaration(M, Id, Tys); + + Builder.SetInsertPoint(CI); + Value *NewPtr = + Builder.CreateBitCast(Ptr, VecTy->getPointerTo(AddressSpace)); + if (IsLoad) { + CallInst *CallI = Builder.CreateCall(Callee, NewPtr, "ldN"); + SmallVector Vectors; + for (unsigned i = 0; i < NumVec; i++) + Vectors.push_back(Builder.CreateExtractValue(CallI, i)); + + // Concatenate apart vectors from the new ldN intrinsic call to a wide + // vector. + Value *LargeVec = ConcateVectors(Builder, Vectors, 0, NumVec); + CI->replaceAllUsesWith(LargeVec); + CI->eraseFromParent(); + } else { + Value *IdxVec = CI->getArgOperand(0); + Value *UndefV = UndefValue::get(WideVecTy); + + // Split the wide vector of index store to apart vectors for the new + // stN intrinsic call. 
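+      // E.g. for st2 with <4 x i32> parts, the stored <8 x i32> value is
+      // split with the masks <0, 1, 2, 3> and <4, 5, 6, 7>.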
+ SmallVector Ops; + for (unsigned i = 0; i < NumVec; i++) { + Constant *Mask = getSequentialIndices(Builder, NumElts * i, NumElts, 0); + Ops.push_back(Builder.CreateShuffleVector(IdxVec, UndefV, Mask)); + } + Ops.push_back(NewPtr); + Builder.CreateCall(Callee, Ops); + CI->eraseFromParent(); + } + } + + return Changed; +} + +bool AArch64InterleaveAccess::runOnFunction(Function &F) { + DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n"); + TTI = &getAnalysis().getTTI(F); + M = F.getParent(); + DL = &M->getDataLayout(); + + bool Changed = false; + for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) + Changed |= castInterleaveIntrinsics(B); + return Changed; +} Index: lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetMachine.cpp +++ lib/Target/AArch64/AArch64TargetMachine.cpp @@ -214,6 +214,9 @@ // ourselves. addPass(createAtomicExpandPass(TM)); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64InterleaveAccessPass()); + // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. We can exploit existing control-flow in // ldrex/strex loops to simplify this, but it needs tidying up. Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -139,6 +139,11 @@ bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); + unsigned getInterleaveMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, unsigned AddressSpace); + bool supportInterleaveAccess(); + bool isLegalInterleaveType(Type *DataTy); + bool isLegalInterleaveNum(unsigned NumVec); /// @} }; Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -407,6 +407,28 @@ return LT.first; } +unsigned AArch64TTIImpl::getInterleaveMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, + unsigned AddressSpace) { + if (!isLegalInterleaveType(SrcTy)) + // To calculate scalar take the regular cost, without mask + return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); + return 1; +} + +bool AArch64TTIImpl::isLegalInterleaveNum(unsigned NumVec) { + return NumVec > 1 && NumVec < 5; +} + +bool AArch64TTIImpl::supportInterleaveAccess() { return true; } + +bool AArch64TTIImpl::isLegalInterleaveType(Type *VecType) { + VectorType *VecTy = dyn_cast(VecType); + if (!VecTy) + return false; + return isTypeLegal(VecTy); +} + unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { unsigned Cost = 0; for (auto *I : Tys) { Index: lib/Target/AArch64/CMakeLists.txt =================================================================== --- lib/Target/AArch64/CMakeLists.txt +++ lib/Target/AArch64/CMakeLists.txt @@ -32,6 +32,7 @@ AArch64ISelDAGToDAG.cpp AArch64ISelLowering.cpp AArch64InstrInfo.cpp + AArch64InterleaveAccess.cpp AArch64LoadStoreOptimizer.cpp AArch64MCInstLower.cpp AArch64PromoteConstant.cpp Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -351,6 +351,8 @@ /// broadcast them into a vector. 
VectorParts &getVectorValue(Value *V); + void vectorizeInterleaveAccesses(Instruction *Instr, Value *Ptr, Type *VecTy); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -568,6 +570,14 @@ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} + ~LoopVectorizationLegality() { + SmallSet DeleteSet; + for (auto &I : InterleaveAccessMap) + DeleteSet.insert(dyn_cast(I.second)); + for (auto *Ptr : DeleteSet) + delete Ptr; + } + /// This enum represents the kinds of reductions that we support. enum ReductionKind { RK_NoReduction, ///< Not a reduction. @@ -702,6 +712,49 @@ ConstantInt *StepValue; }; + // An interleave access list contains interleaved accesses. + // E.g. for (i = 0; i < N; i++) { + // even = A[i*2] + // odd = A[i*2 + 1] + // ... // operations on even/odd + // } + // The InterleaveAccessList consists of two access: load even, load odd. + struct InterleaveAccessList { + InterleaveAccessList(int Stride, bool IsLoad) + : Stride(Stride), IsLoad(IsLoad) {} + InterleaveAccessList() : Stride(0) {} + + unsigned getIndex(Instruction *Access) { + unsigned Num = AccessList.size(); + for (unsigned i = 0; i < Num; i++) + if (AccessList[i] == Access) + return i; + return Num; + } + + bool isInsertPos(Instruction *Access) { return Access == InsertPos; } + + int Stride; + bool IsLoad; + unsigned Align; + // The interleaved access instruction list. + SmallVector AccessList; + // To avoid breaking the dependences of each access, the newly creately + // interleave access should be insert at the position of: + // (1) The first load access. + // or + // (2) The last store access. + // + // E.g. %even = load i32 // Insert Position + // %add = add i32 %even // Use of %even + // %odd = load i32 + // + // store i32 %even + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position + Instruction *InsertPos; + }; + /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. typedef DenseMap ReductionList; @@ -796,6 +849,15 @@ unsigned getNumPredStores() const { return NumPredStores; } + + bool isLegalInterleaveAccess(Instruction *I, Type *VecTy) { + return InterleaveAccessMap.count(I) && TTI->isLegalInterleaveType(VecTy); + } + + InterleaveAccessList *getInterleaveAccessList(Instruction *I) { + return InterleaveAccessMap[I]; + } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -844,6 +906,17 @@ /// invariant. void collectStridedAccess(Value *LoadOrStoreInst); + /// Collect constant strided memory access. + void collectConstStridedAccess(MapVector &Strides); + + /// Analyze interleave accesses. + void analyzeInterleaveAccesses(); + + /// For a queue of accesses with the same kind, stride and base pointer, + /// find interleaved accesses. + void collectInterleaveAccesses(SmallVector &StrideQueue, + int Stride); + /// Report an analysis message to assist the user in diagnosing loops that are /// not vectorized. These are handled as LoopAccessReport rather than /// VectorizationReport because the << operator of VectorizationReport returns @@ -902,6 +975,9 @@ /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; + + /// Interleave memory data. 
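+  /// Each interleaved access maps to the InterleaveAccessList shared by its
+  /// whole group; the destructor therefore collects the lists into a set
+  /// before deleting them.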
+ DenseMap InterleaveAccessMap; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1737,6 +1813,174 @@ "reverse"); } +// Create the interleaved indices vector: +// 0, Num, Num*2, ..., 1, Num + 1, Num*2 + 1, ... +// E.g. For 2 interleaved vectors of <4 x i32> type, the indices should be: +// 0, 2, 4, 6, 1, 3, 5, 7 +static Constant *getInterleaveIndices(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Indices; + for (unsigned i = 0; i < NumVec; i++) + for (unsigned j = 0; j < VF; j++) + Indices.push_back(Builder.getInt32(j * NumVec + i)); + return ConstantVector::get(Indices); +} + +// Get a indices vector starts with sequential constant and ends with UNDEFs: +// Idx, Idx + 1, Idx + 2, ..., Idx + Num - 1, undef, undef, ..., undef. +static Constant *getSequentialIndices(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Indices; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Indices.push_back(Builder.getInt32(i)); + if (NumUndef == 0) + return ConstantVector::get(Indices); + + Constant *UndefVal = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Indices.push_back(UndefVal); + return ConstantVector::get(Indices); +} + +// Concatenate two vectors with the same element type. The first vector has +// not less elements as the second vector. +static Value *ConcateTwoVectors(IRBuilder<> &Builder, Value *V1, Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + if (NumElts1 == NumElts2) { + Constant *Mask = getSequentialIndices(Builder, 0, NumElts1 * 2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); + } + + assert(NumElts1 > NumElts2 && "Expect the first vector has more elements"); + // Extend V2 to be the same vector type as V1 with undef elements. + Constant *ExtMask = + getSequentialIndices(Builder, 0, NumElts2, NumElts1 - NumElts2); + Value *UndefV = UndefValue::get(VecTy2); + Value *ExtV2 = Builder.CreateShuffleVector(V2, UndefV, ExtMask); + + Constant *Mask = getSequentialIndices(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, ExtV2, Mask); +} + +// Concatenate NumVec of vectors in VecList start at Idx +static Value *ConcateVectors(IRBuilder<> &Builder, + SmallVector &VecList, unsigned Idx, + unsigned NumVec) { + assert(NumVec != 0 && "Number of vectors should not be zero"); + if (NumVec == 1) + return VecList[Idx]; + + unsigned NumPart = PowerOf2Floor(NumVec); + // When the number of vectors is not power of 2, split the list and make sure + // the first list has power of 2 elements. + // E.g. To concate 7 vectors, split into two lists with 4 and 3 elements. + if (NumPart == NumVec) + NumPart /= 2; + Value *V0 = ConcateVectors(Builder, VecList, Idx, NumPart); + unsigned NextIdx = Idx + NumPart; + Value *V1 = ConcateVectors(Builder, VecList, NextIdx, NumVec - NumPart); + return ConcateTwoVectors(Builder, V0, V1); +} + +// Create the vectorized instruction for interleave access. +// E.g. 
If VF is 4, transform: +// %S1 = load i32* %ptr1 +// %S2 = load i32* %ptr2 +// To +// %IdxV = call <8 x i32> @index.load(i8* %ptr, <0, 2, 4, 6, 1, 3, 5, 7>) +// %V1 = shufflevector <8 x i32> %IdxV, <8 x i32> undef, <0, 1, 2, 3> +// %V2 = shufflevector <8 x i32> %IdxV, <8 x i32> undef, <4, 5, 6, 7> +void InnerLoopVectorizer::vectorizeInterleaveAccesses(Instruction *Instr, + Value *Ptr, + Type *DataTy) { + VectorType *VecTy = dyn_cast(DataTy); + assert(VecTy && "expect a vector type"); + + LoopVectorizationLegality::InterleaveAccessList *IA = + Legal->getInterleaveAccessList(Instr); + assert(IA->InsertPos == Instr && "unexpected insert point"); + + unsigned Idx = IA->getIndex(Instr); + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + Type *I8PtrTy = Builder.getInt8PtrTy(AddressSpace); + Constant *Zero = Builder.getInt32(0); + + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + for (unsigned Part = 0; Part < UF; Part++) { + // Adjust pointer to the index 0 access. + Constant *Offset = Builder.getInt32(-static_cast(Idx)); + Value *NewPtr = Builder.CreateExtractElement(PtrParts[Part], Zero); + NewPtr = Builder.CreateGEP(NewPtr, Offset); + NewPtr = Builder.CreateBitCast(NewPtr, I8PtrTy); + NewPtrs.push_back(NewPtr); + } + + SmallVector &AccessList = IA->AccessList; + unsigned Align = IA->Align; + unsigned NumVec = AccessList.size(); + Type *EleTy = VecTy->getElementType(); + Type *WideVecTy = VectorType::get(EleTy, NumVec * VF); + bool IsElePtr = EleTy->isPointerTy(); + Type *IntPtrVecTy; + if (IsElePtr) { + const DataLayout &DL = Instr->getModule()->getDataLayout(); + Type *IntPtrTy = DL.getIntPtrType(EleTy); + IntPtrVecTy = VectorType::get(IntPtrTy, NumVec * VF); + } + + setDebugLocFromInst(Builder, Instr); + if (IA->IsLoad) { + for (unsigned Part = 0; Part < UF; Part++) { + // If the element is pointer, call index.load with an integer vector type + // and cast the result to pointer vector. + Type *RetVecTy = IsElePtr ? IntPtrVecTy : WideVecTy; + + // Call index.load to read data from memory, with deinterleaving. + Constant *Mask = getInterleaveIndices(Builder, VF, NumVec); + Value *IdxVec = Builder.CreateIndexLoad(RetVecTy, NewPtrs[Part], Mask, + Align, "interleave.load"); + propagateMetadata(dyn_cast(IdxVec), Instr); + if (IsElePtr) + IdxVec = Builder.CreateIntToPtr(IdxVec, WideVecTy); + + // Split the large index load vector into small vectors for apart load. + Value *UndefV = UndefValue::get(WideVecTy); + for (unsigned i = 0; i < NumVec; i++) { + VectorParts &Entry = WidenMap.get(AccessList[i]); + Constant *Indices = getSequentialIndices(Builder, VF * i, VF, 0); + Entry[Part] = Builder.CreateShuffleVector(IdxVec, UndefV, Indices); + } + } + return; + } + + for (unsigned Part = 0; Part < UF; Part++) { + SmallVector StoredVal; + for (unsigned i = 0; i < NumVec; i++) + StoredVal.push_back(getVectorValue(AccessList[i]->getOperand(0))[Part]); + // Concatenate the apart vectors into a large vector for index.store. + Value *IdxVec = ConcateVectors(Builder, StoredVal, 0, NumVec); + + // If the element is pointer, cast the pointer vector to integer pointer. + if (IsElePtr) + IdxVec = Builder.CreatePtrToInt(IdxVec, IntPtrVecTy); + + // Call index.store write data to memory, with reinterleaving. 
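+    // E.g. with VF = 4 and two interleaved stores, the indices are
+    // <0, 2, 4, 6, 1, 3, 5, 7>: lanes of the first part are written to the
+    // even offsets and lanes of the second part to the odd offsets.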
+ Constant *Mask = getInterleaveIndices(Builder, VF, NumVec); + CallInst *CI = Builder.CreateIndexStore(IdxVec, NewPtrs[Part], Mask, Align); + propagateMetadata(CI, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1764,6 +2008,15 @@ if (ScalarAllocatedSize != VectorElementSize) return scalarizeInstruction(Instr); + if (Legal->isLegalInterleaveAccess(Instr, DataTy)) { + LoopVectorizationLegality::InterleaveAccessList *IA = + Legal->getInterleaveAccessList(Instr); + // Skip the access that is not the insert position. + if (!IA->isInsertPos(Instr)) + return; + return vectorizeInterleaveAccesses(Instr, Ptr, DataTy); + } + // If the pointer is loop invariant or if it is non-consecutive, // scalarize the load. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -1859,7 +2112,7 @@ if (Legal->isMaskRequired(SI)) NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, Mask[Part]); - else + else NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); propagateMetadata(NewSI, SI); } @@ -3570,6 +3823,9 @@ // Collect all of the variables that remain uniform after vectorization. collectLoopUniforms(); + if (TTI->supportInterleaveAccess()) + analyzeInterleaveAccesses(); + DEBUG(dbgs() << "LV: We can vectorize this loop" << (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" : "") @@ -3581,6 +3837,313 @@ return true; } +static bool isInBoundsGep(Value *Ptr) { + if (GetElementPtrInst *GEP = dyn_cast(Ptr)) + return GEP->isInBounds(); + return false; +} + +/// \brief Check whether the access through \p Ptr has a constant stride. +static int isStridedPtr(ScalarEvolution *SE, Value *Ptr, const Loop *Lp, + const SCEV *&PtrScev) { + const Type *Ty = Ptr->getType(); + assert(Ty->isPointerTy() && "Unexpected non-ptr"); + const PointerType *PtrTy = cast(Ty); + + PtrScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(PtrScev); + // The pointer must be an AddRecExpr pointer and the accesss function must + // stride over the innermost loop. + if (!AR || Lp != AR->getLoop()) + return 0; + + // The address calculation must not wrap. Otherwise, a dependence could be + // inverted. + // An inbounds getelementptr that is a AddRec with a unit stride + // cannot wrap per definition. The unit stride requirement is checked later. + // An getelementptr without an inbounds attribute and unit stride would have + // to access the pointer value "0" which is undefined behavior in address + // space 0, therefore we can also vectorize this case. + bool IsInBoundsGEP = isInBoundsGep(Ptr); + bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); + bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; + if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) + return 0; + + // Check the step is constant. + const SCEV *Step = AR->getStepRecurrence(*SE); + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast(Step); + if (!C) + return 0; + + auto &DL = Lp->getHeader()->getModule()->getDataLayout(); + int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); + const APInt &APStepVal = C->getValue()->getValue(); + + // Huge step value - give up. + if (APStepVal.getBitWidth() > 64) + return 0; + + int64_t StepVal = APStepVal.getSExtValue(); + + // Strided access. 
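+  // E.g. accessing i32 elements as A[3*i] gives StepVal = 12 and Size = 4,
+  // i.e. a stride of 3.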
+ int64_t Stride = StepVal / Size; + int64_t Rem = StepVal % Size; + if (Rem) + return 0; + + // If the SCEV could wrap but we have an inbounds gep with a unit stride we + // know we can't "wrap around the address space". In case of address space + // zero we know that this won't happen without triggering undefined behavior. + if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && + Stride != 1 && Stride != -1) + return 0; + + return Stride; +} + +static Value *getPointerOperand(Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return nullptr; +} + +void LoopVectorizationLegality::collectConstStridedAccess( + MapVector &StridedAccesses) { + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + Value *Ptr = getPointerOperand(it); + if (!Ptr) + continue; + + const SCEV *PtrScev; + int Stride = isStridedPtr(SE, Ptr, TheLoop, PtrScev); + if (Stride < 2 && Stride > -2) + continue; + + DEBUG(dbgs() << "LV: Found a constant strided access\n"); + DEBUG(dbgs() << " pointer:" << *Ptr << "\n"); + DEBUG(dbgs() << " Stride:" << Stride << "\n"); + StridedAccesses[it] = Stride; + } + } +} + +static unsigned getAddressSpaceOperand(Instruction *I) { + if (LoadInst *L = dyn_cast(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast(I)) + return S->getPointerAddressSpace(); + return -1; +} + +static Value *getBasePointer(Instruction *I, const DataLayout &DL) { + Value *Ptr = getPointerOperand(I); + assert(Ptr && "Fail to get point from a memory instruction"); + return GetUnderlyingObject(Ptr, DL); +} + +static bool isConsecutiveAccess(Instruction *A, Instruction *B, + ScalarEvolution *SE, const DataLayout &DL) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) + return false; + + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; + + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); + Type *Ty = cast(PtrA->getType())->getElementType(); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); + + APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + + APInt OffsetDelta = OffsetB - OffsetA; + + // Check if they are based on the same pointer. That makes the offsets + // sufficient. + if (PtrA == PtrB) + return OffsetDelta == Size; + + // Compute the necessary base pointer delta to have the necessary final delta + // equal to the size. + APInt BaseDelta = Size - OffsetDelta; + + // Otherwise compute the distance with SCEV between the base pointers. 
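+  // E.g. two i32 loads from A[2*i] and A[2*i+1] have SCEVs that differ by
+  // exactly the 4-byte element size, so they are treated as consecutive.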
+ const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(BaseDelta); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} + +void LoopVectorizationLegality::collectInterleaveAccesses( + SmallVector &StrideQueue, int Stride) { + if (StrideQueue.size() < 2) { + StrideQueue.clear(); + return; + } + + SetVector Heads; + SetVector Tails; + SmallDenseMap ConsecutiveChain; + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (unsigned i = 0, e = StrideQueue.size(); i != e; i++) { + for (unsigned j = 0; j != e; j++) { + if (i == j) + continue; + if (isConsecutiveAccess(StrideQueue[i], StrideQueue[j], SE, DL)) { + Heads.insert(StrideQueue[i]); + Tails.insert(StrideQueue[j]); + ConsecutiveChain[StrideQueue[i]] = StrideQueue[j]; + } + } + } + + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + SmallSetVector Operands; + Instruction *I = *it; + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + Operands.insert(I); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + unsigned Size = Operands.size(); + unsigned NumVec = std::abs(Stride); + if (Size < NumVec) + continue; + + // If Size >= NumVec, there may be more than 1 interleave access. + for (unsigned Part = 0; Part < Size / NumVec; Part++) { + bool IsLoad = StrideQueue[0]->mayReadFromMemory(); + InterleaveAccessList *ID = new InterleaveAccessList(Stride, IsLoad); + unsigned MaxAlign = 1; + for (unsigned i = Part * NumVec; i < Part * NumVec + NumVec; i++) { + Instruction *Access = Operands[i]; + ID->AccessList.push_back(Access); + InterleaveAccessMap[Access] = ID; + + unsigned Align = IsLoad ? dyn_cast(Access)->getAlignment() + : dyn_cast(Access)->getAlignment(); + if (Align > MaxAlign) + MaxAlign = Align; + } + ID->Align = MaxAlign; + + if (IsLoad) { // Insert point is the first load access. + for (unsigned i = 0, e = StrideQueue.size(); i != e; i++) { + if (Operands.count(StrideQueue[i])) { + ID->InsertPos = StrideQueue[i]; + break; + } + } + } else { // Insert point is the last store access. + for (int i = StrideQueue.size() - 1, e = -1; i != e; i--) { + if (Operands.count(StrideQueue[i])) { + ID->InsertPos = StrideQueue[i]; + break; + } + } + } + + DEBUG(dbgs() << "LV: Found interleaved accesses with Stride " << Stride + << "\n"); + } + } + + StrideQueue.clear(); + return; +} + +void LoopVectorizationLegality::analyzeInterleaveAccesses() { + MapVector StridedAccesses; + collectConstStridedAccess(StridedAccesses); + + const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + if (blockNeedsPredication(*bb)) + continue; + + // Queue for the accesses of the same kind, Stride and Base pointer. + SmallVector StrideQueue; + bool TryMerge = false; + Value *CurBase = nullptr; + int CurStride; + bool IsLoad = false; + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + if (TryMerge) { + collectInterleaveAccesses(StrideQueue, CurStride); + CurBase = nullptr; + TryMerge = false; + } + + if (!it->mayReadOrWriteMemory()) + continue; + + // Break. Merge the queue. 
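+      // An access without a constant stride ends the current group; try to
+      // form interleave groups from the accesses queued so far.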
+ if (!StridedAccesses.count(it)) { + TryMerge = true; + continue; + } + + Value *Base = getBasePointer(it, DL); + int Stride = StridedAccesses[it]; + bool Read = it->mayReadFromMemory(); + + if (CurBase == nullptr) { + // Skip the illegal number of interleaved access. + if (!TTI->isLegalInterleaveNum(std::abs(Stride))) + continue; + + StrideQueue.push_back(it); + CurBase = Base; + CurStride = Stride; + IsLoad = Read; + continue; + } + + // This instructure doesn't fit the current queue. Try to merge current + // queue and insert it again. + if (CurBase != Base || CurStride != Stride || IsLoad != Read) { + TryMerge = true; + --it; + continue; + } + + StrideQueue.push_back(it); + } + + if (StrideQueue.size() > 1) + collectInterleaveAccesses(StrideQueue, CurStride); + } +} + static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { if (Ty->isPointerTy()) return DL.getIntPtrType(Ty); @@ -5103,7 +5666,9 @@ const DataLayout &DL = I->getModule()->getDataLayout(); unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy); unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF; - if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { + bool IsInterleave = Legal->isLegalInterleaveAccess(I, VectorTy); + if ((!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) && + !IsInterleave) { bool IsComplexComputation = isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop); unsigned Cost = 0; @@ -5129,11 +5694,16 @@ // Wide load/stores. unsigned Cost = TTI.getAddressComputationCost(VectorTy); - if (Legal->isMaskRequired(I)) + if (Legal->isMaskRequired(I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - else + } else if (IsInterleave) { + Alignment = Legal->getInterleaveAccessList(I)->Align; + Cost += TTI.getInterleaveMemoryOpCost(I->getOpcode(), VectorTy, Alignment, + AS); + } else { Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + } if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, Index: test/CodeGen/AArch64/interleaved-access-to-ldN-stN.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/interleaved-access-to-ldN-stN.ll @@ -0,0 +1,73 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnueabi" + + +; Make sure the intrinsic about 2 interleaved vectors can be matched +; CHECK-LABEL: test_ld2_st2: +; CHECK: ld2 +; CHECK: st2 + +define void @test_ld2_st2(i8* %ptr) { +entry: + %interleave.load = call <8 x i32> @llvm.index.load.v8i32(i8* %ptr, <8 x i32> , i32 4) + %0 = shufflevector <8 x i32> %interleave.load, <8 x i32> undef, <4 x i32> + %1 = shufflevector <8 x i32> %interleave.load, <8 x i32> undef, <4 x i32> + %2 = add nsw <4 x i32> %0, + %3 = add nsw <4 x i32> %1, + %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> + call void @llvm.index.store.v8i32(<8 x i32> %4, i8* %ptr, <8 x i32> , i32 4) + ret void +} + +; Make sure the intrinsic about 3 interleaved vectors can be matched +; CHECK-LABEL: test_ld3_st3: +; CHECK: ld3 +; CHECK: st3 + +define void @test_ld3_st3(i8* %ptr) { +entry: + %interleave.load = call <12 x i32> @llvm.index.load.v12i32(i8* %ptr, <12 x i32> , i32 4) + %0 = shufflevector <12 x i32> %interleave.load, <12 x i32> undef, <4 x i32> + %1 = shufflevector <12 x i32> %interleave.load, <12 x i32> undef, <4 x i32> + %2 = shufflevector <12 x i32> %interleave.load, <12 x i32> undef, <4 x i32> + %3 = add nsw <4 x i32> %0, + %4 = 
add nsw <4 x i32> %1, + %5 = add nsw <4 x i32> %2, + %6 = shufflevector <4 x i32> %3, <4 x i32> %4, <8 x i32> + %7 = shufflevector <4 x i32> %5, <4 x i32> undef, <8 x i32> + %8 = shufflevector <8 x i32> %6, <8 x i32> %7, <12 x i32> + call void @llvm.index.store.v12i32(<12 x i32> %8, i8* %ptr, <12 x i32> , i32 4) + ret void +} + +; Make sure the intrinsic about 3 interleaved vectors can be matched +; CHECK-LABEL: test_ld4_st4: +; CHECK: ld4 +; CHECK: st4 + +define void @test_ld4_st4(i8* %ptr) { +entry: + %interleave.load = call <16 x i32> @llvm.index.load.v16i32(i8* %ptr, <16 x i32> , i32 4) + %0 = shufflevector <16 x i32> %interleave.load, <16 x i32> undef, <4 x i32> + %1 = shufflevector <16 x i32> %interleave.load, <16 x i32> undef, <4 x i32> + %2 = shufflevector <16 x i32> %interleave.load, <16 x i32> undef, <4 x i32> + %3 = shufflevector <16 x i32> %interleave.load, <16 x i32> undef, <4 x i32> + %4 = add nsw <4 x i32> %0, + %5 = add nsw <4 x i32> %1, + %6 = add nsw <4 x i32> %2, + %7 = add nsw <4 x i32> %3, + %8 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> + %9 = shufflevector <4 x i32> %6, <4 x i32> %7, <8 x i32> + %10 = shufflevector <8 x i32> %8, <8 x i32> %9, <16 x i32> + call void @llvm.index.store.v16i32(<16 x i32> %10, i8* %ptr, <16 x i32> , i32 4) + ret void +} + +declare <8 x i32> @llvm.index.load.v8i32(i8*, <8 x i32>, i32) +declare void @llvm.index.store.v8i32(<8 x i32>, i8*, <8 x i32>, i32) +declare <12 x i32> @llvm.index.load.v12i32(i8*, <12 x i32>, i32) +declare void @llvm.index.store.v12i32(<12 x i32>, i8*, <12 x i32>, i32) +declare <16 x i32> @llvm.index.load.v16i32(i8*, <16 x i32>, i32) +declare void @llvm.index.store.v16i32(<16 x i32>, i8*, <16 x i32>, i32) Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[IL:.*]] = call <8 x i32> @llvm.index.load.v8i32(i8* {{.*}}, <8 x i32> +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: %[[IL1:.*]] = call <8 x i32> @llvm.index.load.v8i32(i8* {{.*}}, <8 x i32> +; CHECK: shufflevector <8 x i32> %[[IL1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[IL1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[IL:.*]] = call <4 x i32> @llvm.index.load.v4i32(i8* {{.*}}, <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[IL]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[IL]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; FORCE-VEC: 
add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/AArch64/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AArch64/interleaved-access.ll @@ -0,0 +1,358 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -runtime-memory-check-threshold=24 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 -runtime-memory-check-threshold=24 < %s | FileCheck %s --check-prefix=UNROLL + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnueabi" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; +; int test_struct_load2(struct ST2 *A, int n) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %[[IL:.*]] = call <8 x i32> @llvm.index.load.v8i32(i8* {{.*}}, <8 x i32> , i32 4) +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +; UNROLL-LABEL: @test_struct_load2( +; UNROLL: %[[IL0:.*]] = call <8 x i32> @llvm.index.load.v8i32 +; UNROLL: %[[IL1:.*]] = call <8 x i32> @llvm.index.load.v8i32 + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. 
+; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %[[IL:.*]] = call <8 x i32> @llvm.index.load.v8i32(i8* {{.*}}, <8 x i32> , i32 4) +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[IL]], <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %[[VEC:.*]] = shufflevector <4 x i32> {{.*}}, <4 x i32> {{.*}}, <8 x i32> +; CHECK: call void @llvm.index.store.v8i32(<8 x i32> %[[VEC]], i8* {{.*}}, <8 x i32> , i32 4) + +; UNROLL-LABEL: @test_array_load2_store2( +; UNROLL: %[[IL0:.*]] = call <8 x i32> @llvm.index.load.v8i32 +; UNROLL: %[[IL1:.*]] = call <8 x i32> @llvm.index.load.v8i32 +; UNROLL: call void @llvm.index.store.v8i32 +; UNROLL: call void @llvm.index.store.v8i32 + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %[[IL:.*]] = call <12 x i32> @llvm.index.load.v12i32(i8* {{.*}}, <12 x i32> , i32 4) +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +; UNROLL-LABEL: @test_struct_load3( +; UNROLL: call <12 x i32> @llvm.index.load.v12i32 +; UNROLL: call <12 x i32> @llvm.index.load.v12i32 + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. 
+; int A[3072]; +; struct ST S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; T[i].x = X1 + 1; +; T[i].y = X2 + 2; +; T[i].z = X3 + 3; +; } +; } + +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK: %[[IL:.*]] = call <12 x i32> @llvm.index.load.v12i32(i8* {{.*}}, <12 x i32> , i32 4) +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %[[IL]], <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> +; CHECK: shufflevector <8 x i32> {{.*}}, <12 x i32> +; CHECK: call void @llvm.index.store.v12i32(<12 x i32> {{.*}}, i8* {{.*}}, <12 x i32> , i32 4) + +; UNROLL-LABEL: @test_struct_array_load3_store3( +; UNROLL: call <12 x i32> @llvm.index.load.v12i32 +; UNROLL: call <12 x i32> @llvm.index.load.v12i32 +; UNROLL: call void @llvm.index.store.v12i32 +; UNROLL: call void @llvm.index.store.v12i32 + +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %[[IL:.*]] = call <16 x i32> @llvm.index.load.v16i32(i8* {{.*}}, <16 x i32> , i32 4) +; CHECK: shufflevector <16 x i32> %[[IL]], <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %[[IL]], <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %[[IL]], <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %[[IL]], <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +; UNROLL-LABEL: @test_struct_load4( +; UNROLL: call <16 x i32> @llvm.index.load.v16i32 +; UNROLL: call <16 x i32> @llvm.index.load.v16i32 + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: call void @llvm.index.store.v16i32(<16 x i32> {{.*}}, i8* {{.*}}, <16 x i32> , i32 4) + +; UNROLL-LABEL: @test_struct_store4( +; UNROLL: load <4 x i32>, <4 x i32> +; UNROLL: load <4 x i32>, <4 x i32> +; UNROLL: call void @llvm.index.store.v16i32 +; UNROLL: call void @llvm.index.store.v16i32 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], 
[1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +}
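As a usage illustration only (not part of the patch): a minimal sketch of driving the new IRBuilder helper directly, assuming a fixed VF of 4 and two interleaved i32 sequences. The helper name emitEvenOddLoad is made up for the example; it follows the CreateIndexLoad signature added above.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Build an <8 x i32> de-interleaving load through CreateIndexLoad: lanes 0-3
// read offsets 0, 2, 4, 6 and lanes 4-7 read offsets 1, 3, 5, 7.
static Value *emitEvenOddLoad(IRBuilder<> &Builder, Value *Ptr) {
  SmallVector<Constant *, 8> Idx;
  for (unsigned i = 0; i < 2; ++i)
    for (unsigned j = 0; j < 4; ++j)
      Idx.push_back(Builder.getInt32(j * 2 + i));
  Type *WideTy = VectorType::get(Builder.getInt32Ty(), 8);
  Value *I8Ptr = Builder.CreateBitCast(Ptr, Builder.getInt8PtrTy());
  return Builder.CreateIndexLoad(WideTy, I8Ptr, ConstantVector::get(Idx),
                                 /*Align=*/4, "interleave.load");
}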